diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,27799 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 100, + "global_step": 4255, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 818.65625, + "completions/mean_terminated_length": 818.65625, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.47302141785621643, + "epoch": 0.0011750881316098707, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 0.0, + "loss": 0.0, + "num_tokens": 105866.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.0, + "rewards/e2e_recall_precision_mixed_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998794794082642, + "sampling/importance_sampling_ratio/min": 0.04267038777470589, + "sampling/sampling_logp_difference/max": 3.154250144958496, + "sampling/sampling_logp_difference/mean": 0.020557792857289314, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 2188.0, + "completions/max_terminated_length": 2159.5, + "completions/mean_length": 957.0234375, + "completions/mean_terminated_length": 950.7090911865234, + "completions/min_length": 425.5, + "completions/min_terminated_length": 425.5, + "entropy": 0.5007665604352951, + "epoch": 0.005875440658049354, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.9459405541419983, + "learning_rate": 1.5625e-08, + "loss": 0.0304, + "num_tokens": 591756.0, + "reward": 0.140625, + "reward_std": 0.1800631694495678, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.140625, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25662297010421753, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000313818454742, + "sampling/importance_sampling_ratio/min": 0.2443008739501238, + "sampling/sampling_logp_difference/max": 1.5048222541809082, + "sampling/sampling_logp_difference/mean": 0.02111760200932622, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 2233.0, + "completions/max_terminated_length": 2072.4, + "completions/mean_length": 950.425, + "completions/mean_terminated_length": 937.0094848632813, + "completions/min_length": 493.6, + "completions/min_terminated_length": 493.6, + "entropy": 0.5007948040962219, + "epoch": 0.011750881316098707, + "frac_reward_zero_std": 0.3, + "grad_norm": 1.1323678493499756, + "learning_rate": 3.515625e-08, + "loss": -0.0117, + "num_tokens": 1205448.0, + "reward": 0.1359375, + "reward_std": 0.1866186186671257, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.1359375, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2397767573595047, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000044345855713, + "sampling/importance_sampling_ratio/min": 0.23340302407741548, + "sampling/sampling_logp_difference/max": 1.5875318050384521, + "sampling/sampling_logp_difference/mean": 0.021055334806442262, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021875, + "completions/max_length": 2114.2, + "completions/max_terminated_length": 1918.0, + "completions/mean_length": 950.975, + "completions/mean_terminated_length": 918.5927612304688, + "completions/min_length": 437.6, + "completions/min_terminated_length": 437.6, + "entropy": 0.46904313564300537, + "epoch": 0.01762632197414806, + "frac_reward_zero_std": 0.3, + "grad_norm": 1.0092235803604126, + "learning_rate": 5.46875e-08, + "loss": 0.0036, + "num_tokens": 1803156.0, + "reward": 0.17921874821186065, + "reward_std": 0.1914873868227005, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.17921874821186065, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33369612991809844, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998673915863037, + "sampling/importance_sampling_ratio/min": 0.15503151454031466, + "sampling/sampling_logp_difference/max": 2.8057526469230654, + "sampling/sampling_logp_difference/mean": 0.020007848739624023, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2388.8, + "completions/max_terminated_length": 2238.0, + "completions/mean_length": 1094.490625, + "completions/mean_terminated_length": 1051.4914428710938, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "entropy": 0.49076443910598755, + "epoch": 0.023501762632197415, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7749466896057129, + "learning_rate": 7.421874999999999e-08, + "loss": 0.0094, + "num_tokens": 2455897.0, + "reward": 0.10833333432674408, + "reward_std": 0.15549785941839217, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.10833333432674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2179785817861557, + "sampling/importance_sampling_ratio/max": 1.978114414215088, + "sampling/importance_sampling_ratio/mean": 0.9999550819396973, + "sampling/importance_sampling_ratio/min": 0.17499387562274932, + "sampling/sampling_logp_difference/max": 1.8573448657989502, + "sampling/sampling_logp_difference/mean": 0.02045784331858158, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2429.6, + "completions/max_terminated_length": 2392.4, + "completions/mean_length": 968.49375, + "completions/mean_terminated_length": 959.4304931640625, + "completions/min_length": 429.2, + "completions/min_terminated_length": 429.2, + "entropy": 0.47716065049171447, + "epoch": 0.02937720329024677, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4625045955181122, + "learning_rate": 9.375e-08, + "loss": 0.0078, + "num_tokens": 3099823.0, + "reward": 0.08489583395421504, + "reward_std": 0.09943832308053971, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.08489583395421504, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17509274780750275, + "sampling/importance_sampling_ratio/max": 1.9894845247268678, + "sampling/importance_sampling_ratio/mean": 0.9998233199119568, + "sampling/importance_sampling_ratio/min": 0.14390659239143133, + "sampling/sampling_logp_difference/max": 2.679186391830444, + "sampling/sampling_logp_difference/mean": 0.020350834354758263, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.028125, + "completions/max_length": 2164.0, + "completions/max_terminated_length": 2108.0, + "completions/mean_length": 952.025, + "completions/mean_terminated_length": 907.9393676757812, + "completions/min_length": 417.8, + "completions/min_terminated_length": 417.8, + "entropy": 0.4895743727684021, + "epoch": 0.03525264394829612, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 1.1328125e-07, + "loss": 0.0265, + "num_tokens": 3705635.0, + "reward": 0.109375, + "reward_std": 0.11092274188995362, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.109375, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20727644562721254, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999130725860595, + "sampling/importance_sampling_ratio/min": 0.19804175468862012, + "sampling/sampling_logp_difference/max": 3.9093191623687744, + "sampling/sampling_logp_difference/mean": 0.020513736456632615, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2022.4, + "completions/max_terminated_length": 1915.6, + "completions/mean_length": 915.609375, + "completions/mean_terminated_length": 892.3630615234375, + "completions/min_length": 491.6, + "completions/min_terminated_length": 491.6, + "entropy": 0.4711003482341766, + "epoch": 0.041128084606345476, + "frac_reward_zero_std": 0.15, + "grad_norm": 1.1989802122116089, + "learning_rate": 1.328125e-07, + "loss": 0.0196, + "num_tokens": 4306066.0, + "reward": 0.19791666865348817, + "reward_std": 0.2227712243795395, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.19791666865348817, + "rewards/e2e_recall_precision_mixed_reward/std": 0.292997220158577, + "sampling/importance_sampling_ratio/max": 1.9896412134170531, + "sampling/importance_sampling_ratio/mean": 0.9999082565307618, + "sampling/importance_sampling_ratio/min": 0.15322894011624158, + "sampling/sampling_logp_difference/max": 2.785897207260132, + "sampling/sampling_logp_difference/mean": 0.02024412974715233, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.040625, + "completions/max_length": 1985.0, + "completions/max_terminated_length": 1960.2, + "completions/mean_length": 982.38125, + "completions/mean_terminated_length": 922.0343994140625, + "completions/min_length": 458.2, + "completions/min_terminated_length": 458.2, + "entropy": 0.47347159385681153, + "epoch": 0.04700352526439483, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.9034355282783508, + "learning_rate": 1.5234375e-07, + "loss": -0.0147, + "num_tokens": 4929336.0, + "reward": 0.09375, + "reward_std": 0.13284323811531068, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.09375, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24181169271469116, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000688910484314, + "sampling/importance_sampling_ratio/min": 0.19303356036543845, + "sampling/sampling_logp_difference/max": 1.9485284805297851, + "sampling/sampling_logp_difference/mean": 0.02038279250264168, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021875, + "completions/max_length": 2342.0, + "completions/max_terminated_length": 2276.4, + "completions/mean_length": 1027.528125, + "completions/mean_terminated_length": 996.1073974609375, + "completions/min_length": 482.2, + "completions/min_terminated_length": 482.2, + "entropy": 0.5043671131134033, + "epoch": 0.052878965922444184, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.9663382172584534, + "learning_rate": 1.71875e-07, + "loss": 0.0037, + "num_tokens": 5559637.0, + "reward": 0.21223958134651183, + "reward_std": 0.16447981745004653, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.21223958134651183, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2990086942911148, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000964164733888, + "sampling/importance_sampling_ratio/min": 0.11464353739283979, + "sampling/sampling_logp_difference/max": 2.9539984464645386, + "sampling/sampling_logp_difference/mean": 0.021145598217844962, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2461.4, + "completions/max_terminated_length": 2225.4, + "completions/mean_length": 1047.071875, + "completions/mean_terminated_length": 1001.1214721679687, + "completions/min_length": 473.6, + "completions/min_terminated_length": 473.6, + "entropy": 0.48020014762878416, + "epoch": 0.05875440658049354, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.749260663986206, + "learning_rate": 1.9140625e-07, + "loss": 0.0315, + "num_tokens": 6190292.0, + "reward": 0.18177083432674407, + "reward_std": 0.20050898492336272, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.18177083432674407, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2954167366027832, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999913454055787, + "sampling/importance_sampling_ratio/min": 0.1635954909026623, + "sampling/sampling_logp_difference/max": 2.131788170337677, + "sampling/sampling_logp_difference/mean": 0.02059806026518345, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04375, + "completions/max_length": 2298.0, + "completions/max_terminated_length": 2198.2, + "completions/mean_length": 1082.275, + "completions/mean_terminated_length": 1020.4909301757813, + "completions/min_length": 407.6, + "completions/min_terminated_length": 407.6, + "entropy": 0.495795863866806, + "epoch": 0.06462984723854288, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.8722389936447144, + "learning_rate": 2.109375e-07, + "loss": -0.0005, + "num_tokens": 6832436.0, + "reward": 0.1869791716337204, + "reward_std": 0.17626949846744538, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.1869791716337204, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2800693780183792, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998305559158325, + "sampling/importance_sampling_ratio/min": 0.2545178957283497, + "sampling/sampling_logp_difference/max": 1.766669464111328, + "sampling/sampling_logp_difference/mean": 0.020665578171610834, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2152.2, + "completions/max_terminated_length": 2152.2, + "completions/mean_length": 995.3125, + "completions/mean_terminated_length": 995.3125, + "completions/min_length": 499.8, + "completions/min_terminated_length": 499.8, + "entropy": 0.4970084547996521, + "epoch": 0.07050528789659224, + "frac_reward_zero_std": 0.45, + "grad_norm": 1.0150976181030273, + "learning_rate": 2.3046875e-07, + "loss": 0.032, + "num_tokens": 7475432.0, + "reward": 0.19635416567325592, + "reward_std": 0.14557099491357803, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.19635416567325592, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32434697151184083, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000025498867035, + "sampling/importance_sampling_ratio/min": 0.2074445564299822, + "sampling/sampling_logp_difference/max": 1.8555801391601563, + "sampling/sampling_logp_difference/mean": 0.020710907503962518, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.034375, + "completions/max_length": 2296.0, + "completions/max_terminated_length": 2017.4, + "completions/mean_length": 1097.91875, + "completions/mean_terminated_length": 1054.652783203125, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 0.4965181291103363, + "epoch": 0.07638072855464159, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.9392759799957275, + "learning_rate": 2.5e-07, + "loss": 0.0015, + "num_tokens": 8140146.0, + "reward": 0.1579166680574417, + "reward_std": 0.1368572235107422, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.1579166680574417, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25851217806339266, + "sampling/importance_sampling_ratio/max": 1.9590824365615844, + "sampling/importance_sampling_ratio/mean": 0.9998887419700623, + "sampling/importance_sampling_ratio/min": 0.2710499167442322, + "sampling/sampling_logp_difference/max": 1.3179824352264404, + "sampling/sampling_logp_difference/mean": 0.020697080716490745, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2351.6, + "completions/max_terminated_length": 2013.8, + "completions/mean_length": 972.6, + "completions/mean_terminated_length": 948.5700805664062, + "completions/min_length": 435.6, + "completions/min_terminated_length": 435.6, + "entropy": 0.5015838086605072, + "epoch": 0.08225616921269095, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.5987149477005005, + "learning_rate": 2.6953125e-07, + "loss": -0.0088, + "num_tokens": 8764046.0, + "reward": 0.15885416865348817, + "reward_std": 0.16067830175161363, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.15885416865348817, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22674326300621034, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000308990478515, + "sampling/importance_sampling_ratio/min": 0.18691894211806356, + "sampling/sampling_logp_difference/max": 2.474703884124756, + "sampling/sampling_logp_difference/mean": 0.020612315833568574, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2124.0, + "completions/max_terminated_length": 2083.6, + "completions/mean_length": 988.415625, + "completions/mean_terminated_length": 972.781884765625, + "completions/min_length": 492.4, + "completions/min_terminated_length": 492.4, + "entropy": 0.48552640676498415, + "epoch": 0.0881316098707403, + "frac_reward_zero_std": 0.1, + "grad_norm": 1.33731210231781, + "learning_rate": 2.890625e-07, + "loss": 0.0099, + "num_tokens": 9408467.0, + "reward": 0.24713541865348815, + "reward_std": 0.2190377503633499, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.24713541865348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.306401264667511, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999894917011261, + "sampling/importance_sampling_ratio/min": 0.18375444859266282, + "sampling/sampling_logp_difference/max": 1.751639199256897, + "sampling/sampling_logp_difference/mean": 0.020667668804526328, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.028125, + "completions/max_length": 2400.6, + "completions/max_terminated_length": 2171.8, + "completions/mean_length": 990.3125, + "completions/mean_terminated_length": 943.2689819335938, + "completions/min_length": 443.4, + "completions/min_terminated_length": 443.4, + "entropy": 0.49701812863349915, + "epoch": 0.09400705052878966, + "frac_reward_zero_std": 0.3, + "grad_norm": 1.343873143196106, + "learning_rate": 3.0859375e-07, + "loss": -0.0044, + "num_tokens": 10033683.0, + "reward": 0.17656249850988387, + "reward_std": 0.16451094299554825, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.17656250447034835, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2665316700935364, + "sampling/importance_sampling_ratio/max": 1.988339638710022, + "sampling/importance_sampling_ratio/mean": 0.9998800039291382, + "sampling/importance_sampling_ratio/min": 0.18474510461091995, + "sampling/sampling_logp_difference/max": 1.813150119781494, + "sampling/sampling_logp_difference/mean": 0.02064768560230732, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2283.6, + "completions/max_terminated_length": 1947.8, + "completions/mean_length": 964.471875, + "completions/mean_terminated_length": 916.251416015625, + "completions/min_length": 444.8, + "completions/min_terminated_length": 444.8, + "entropy": 0.47441959381103516, + "epoch": 0.099882491186839, + "frac_reward_zero_std": 0.3, + "grad_norm": 1.2677315473556519, + "learning_rate": 3.28125e-07, + "loss": -0.0342, + "num_tokens": 10654690.0, + "reward": 0.16875, + "reward_std": 0.16111062318086625, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.16875000149011612, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25565315783023834, + "sampling/importance_sampling_ratio/max": 1.969423198699951, + "sampling/importance_sampling_ratio/mean": 1.000089454650879, + "sampling/importance_sampling_ratio/min": 0.14717500358819963, + "sampling/sampling_logp_difference/max": 2.103808379173279, + "sampling/sampling_logp_difference/mean": 0.02052120789885521, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2336.6, + "completions/max_terminated_length": 2279.0, + "completions/mean_length": 1002.2875, + "completions/mean_terminated_length": 984.4052978515625, + "completions/min_length": 415.4, + "completions/min_terminated_length": 415.4, + "entropy": 0.4757814884185791, + "epoch": 0.10575793184488837, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.9901517033576965, + "learning_rate": 3.4765625e-07, + "loss": -0.0046, + "num_tokens": 11290958.0, + "reward": 0.2846354186534882, + "reward_std": 0.27473083734512327, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.2846354186534882, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3511134922504425, + "sampling/importance_sampling_ratio/max": 1.9479135990142822, + "sampling/importance_sampling_ratio/mean": 0.9998469710350036, + "sampling/importance_sampling_ratio/min": 0.2177226183936, + "sampling/sampling_logp_difference/max": 1.9894657850265502, + "sampling/sampling_logp_difference/mean": 0.019795811921358108, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 2199.2, + "completions/max_terminated_length": 2174.0, + "completions/mean_length": 993.784375, + "completions/mean_terminated_length": 981.0612548828125, + "completions/min_length": 410.8, + "completions/min_terminated_length": 410.8, + "entropy": 0.5097217082977294, + "epoch": 0.11163337250293771, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.1467088460922241, + "learning_rate": 3.671875e-07, + "loss": 0.0162, + "num_tokens": 11907597.0, + "reward": 0.28682292252779007, + "reward_std": 0.24061587154865266, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.28682292252779007, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3374872386455536, + "sampling/importance_sampling_ratio/max": 1.9729577779769898, + "sampling/importance_sampling_ratio/mean": 1.0000439643859864, + "sampling/importance_sampling_ratio/min": 0.2012358859181404, + "sampling/sampling_logp_difference/max": 1.7005351781845093, + "sampling/sampling_logp_difference/mean": 0.02079613581299782, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2030.2, + "completions/max_terminated_length": 1979.8, + "completions/mean_length": 920.021875, + "completions/mean_terminated_length": 902.3637084960938, + "completions/min_length": 424.6, + "completions/min_terminated_length": 424.6, + "entropy": 0.47220299243927, + "epoch": 0.11750881316098707, + "frac_reward_zero_std": 0.1, + "grad_norm": 1.0923038721084595, + "learning_rate": 3.8671875e-07, + "loss": -0.0021, + "num_tokens": 12509988.0, + "reward": 0.3457812681794167, + "reward_std": 0.2333272099494934, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3457812681794167, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31239897608757017, + "sampling/importance_sampling_ratio/max": 1.989612627029419, + "sampling/importance_sampling_ratio/mean": 0.9997779369354248, + "sampling/importance_sampling_ratio/min": 0.2512615159153938, + "sampling/sampling_logp_difference/max": 1.8858316898345948, + "sampling/sampling_logp_difference/mean": 0.02036282978951931, + "step": 100 + }, + { + "epoch": 0.11750881316098707, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.00875, + "eval_completions/max_length": 2168.76, + "eval_completions/max_terminated_length": 2077.44, + "eval_completions/mean_length": 973.209375, + "eval_completions/mean_terminated_length": 960.3953955078125, + "eval_completions/min_length": 450.52, + "eval_completions/min_terminated_length": 450.52, + "eval_entropy": 0.4852174758911133, + "eval_frac_reward_zero_std": 0.28, + "eval_loss": 0.009060491807758808, + "eval_num_tokens": 12509988.0, + "eval_reward": 0.3109791725873947, + "eval_reward_std": 0.18228219971060752, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.31097917556762694, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.317981595993042, + "eval_runtime": 666.9163, + "eval_samples_per_second": 0.15, + "eval_sampling/importance_sampling_ratio/max": 1.9854744291305542, + "eval_sampling/importance_sampling_ratio/mean": 0.9999976515769958, + "eval_sampling/importance_sampling_ratio/min": 0.2449254010617733, + "eval_sampling/sampling_logp_difference/max": 1.6110917520523071, + "eval_sampling/sampling_logp_difference/mean": 0.020074035078287124, + "eval_steps_per_second": 0.003, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2408.8, + "completions/max_terminated_length": 2266.6, + "completions/mean_length": 1018.646875, + "completions/mean_terminated_length": 981.5686157226562, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.48829834461212157, + "epoch": 0.12338425381903642, + "frac_reward_zero_std": 0.15, + "grad_norm": 1.136791706085205, + "learning_rate": 4.0625e-07, + "loss": 0.0196, + "num_tokens": 13133331.0, + "reward": 0.32442708015441896, + "reward_std": 0.2731425791978836, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3244270861148834, + "rewards/e2e_recall_precision_mixed_reward/std": 0.39561856985092164, + "sampling/importance_sampling_ratio/max": 1.969839334487915, + "sampling/importance_sampling_ratio/mean": 1.00008225440979, + "sampling/importance_sampling_ratio/min": 0.27382618486881255, + "sampling/sampling_logp_difference/max": 1.4294708490371704, + "sampling/sampling_logp_difference/mean": 0.020213060453534125, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2152.0, + "completions/max_terminated_length": 2136.8, + "completions/mean_length": 971.184375, + "completions/mean_terminated_length": 962.189306640625, + "completions/min_length": 476.8, + "completions/min_terminated_length": 476.8, + "entropy": 0.4925812900066376, + "epoch": 0.12925969447708577, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.1485499143600464, + "learning_rate": 4.2578124999999997e-07, + "loss": -0.0082, + "num_tokens": 13765366.0, + "reward": 0.3632812604308128, + "reward_std": 0.2154562935233116, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3632812604308128, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3520397037267685, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999819278717041, + "sampling/importance_sampling_ratio/min": 0.2579982398077846, + "sampling/sampling_logp_difference/max": 1.8680977821350098, + "sampling/sampling_logp_difference/mean": 0.0207187470048666, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2001.8, + "completions/max_terminated_length": 1935.0, + "completions/mean_length": 925.90625, + "completions/mean_terminated_length": 920.6155395507812, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "entropy": 0.4797549247741699, + "epoch": 0.13513513513513514, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7465731501579285, + "learning_rate": 4.4531249999999997e-07, + "loss": -0.0196, + "num_tokens": 14365572.0, + "reward": 0.4515625059604645, + "reward_std": 0.1962999165058136, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4515625059604645, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36195426285266874, + "sampling/importance_sampling_ratio/max": 1.9571106672286986, + "sampling/importance_sampling_ratio/mean": 0.9999866843223572, + "sampling/importance_sampling_ratio/min": 0.2458883583545685, + "sampling/sampling_logp_difference/max": 1.5592980146408082, + "sampling/sampling_logp_difference/mean": 0.02007099725306034, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2122.8, + "completions/max_terminated_length": 1935.6, + "completions/mean_length": 1038.65, + "completions/mean_terminated_length": 1022.6489013671875, + "completions/min_length": 478.6, + "completions/min_terminated_length": 478.6, + "entropy": 0.4651613235473633, + "epoch": 0.1410105757931845, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8895966410636902, + "learning_rate": 4.6484374999999997e-07, + "loss": -0.0124, + "num_tokens": 15020500.0, + "reward": 0.3198958396911621, + "reward_std": 0.20608305782079697, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3198958396911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29388985931873324, + "sampling/importance_sampling_ratio/max": 1.958699345588684, + "sampling/importance_sampling_ratio/mean": 1.0000421762466432, + "sampling/importance_sampling_ratio/min": 0.26278833746910096, + "sampling/sampling_logp_difference/max": 1.4563037395477294, + "sampling/sampling_logp_difference/mean": 0.019570792466402052, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.059375, + "completions/max_length": 2256.8, + "completions/max_terminated_length": 2058.4, + "completions/mean_length": 1109.3, + "completions/mean_terminated_length": 1020.2399169921875, + "completions/min_length": 480.6, + "completions/min_terminated_length": 480.6, + "entropy": 0.4841072797775269, + "epoch": 0.14688601645123384, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8954580426216125, + "learning_rate": 4.84375e-07, + "loss": -0.0518, + "num_tokens": 15635352.0, + "reward": 0.3098958432674408, + "reward_std": 0.16147686839103698, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3098958432674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3426029205322266, + "sampling/importance_sampling_ratio/max": 1.9685364723205567, + "sampling/importance_sampling_ratio/mean": 1.0001961946487428, + "sampling/importance_sampling_ratio/min": 0.3116583779454231, + "sampling/sampling_logp_difference/max": 1.2820564270019532, + "sampling/sampling_logp_difference/mean": 0.020258011296391487, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2127.8, + "completions/max_terminated_length": 2040.8, + "completions/mean_length": 1004.88125, + "completions/mean_terminated_length": 1000.2262573242188, + "completions/min_length": 502.6, + "completions/min_terminated_length": 502.6, + "entropy": 0.4659832537174225, + "epoch": 0.15276145710928318, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.921988308429718, + "learning_rate": 4.998788466198207e-07, + "loss": -0.0002, + "num_tokens": 16296942.0, + "reward": 0.36302084624767306, + "reward_std": 0.19413625001907348, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.36302084624767306, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3169731110334396, + "sampling/importance_sampling_ratio/max": 1.9976950645446778, + "sampling/importance_sampling_ratio/mean": 1.000005567073822, + "sampling/importance_sampling_ratio/min": 0.2085201695561409, + "sampling/sampling_logp_difference/max": 1.7547591209411622, + "sampling/sampling_logp_difference/mean": 0.019374296069145203, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021875, + "completions/max_length": 2315.2, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1105.909375, + "completions/mean_terminated_length": 1075.3181396484374, + "completions/min_length": 484.6, + "completions/min_terminated_length": 484.6, + "entropy": 0.4691031098365784, + "epoch": 0.15863689776733256, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.7131484150886536, + "learning_rate": 4.992730797189241e-07, + "loss": -0.0383, + "num_tokens": 16982229.0, + "reward": 0.2643229216337204, + "reward_std": 0.1752532333135605, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.2643229216337204, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2676455333828926, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999748706817627, + "sampling/importance_sampling_ratio/min": 0.24143882989883422, + "sampling/sampling_logp_difference/max": 1.5030804395675659, + "sampling/sampling_logp_difference/mean": 0.019500917568802834, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.028125, + "completions/max_length": 2466.6, + "completions/max_terminated_length": 2179.8, + "completions/mean_length": 1064.678125, + "completions/mean_terminated_length": 1018.6744384765625, + "completions/min_length": 527.2, + "completions/min_terminated_length": 527.2, + "entropy": 0.47563568949699403, + "epoch": 0.1645123384253819, + "frac_reward_zero_std": 0.15, + "grad_norm": 1.0195895433425903, + "learning_rate": 4.986673128180276e-07, + "loss": 0.0268, + "num_tokens": 17612586.0, + "reward": 0.3347395837306976, + "reward_std": 0.2197277307510376, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3347395837306976, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31412242650985717, + "sampling/importance_sampling_ratio/max": 1.9938777923583983, + "sampling/importance_sampling_ratio/mean": 0.9999652743339539, + "sampling/importance_sampling_ratio/min": 0.09777447709363969, + "sampling/sampling_logp_difference/max": 4.446249318122864, + "sampling/sampling_logp_difference/mean": 0.01978233680129051, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2201.0, + "completions/max_terminated_length": 2164.0, + "completions/mean_length": 1086.365625, + "completions/mean_terminated_length": 1063.9522705078125, + "completions/min_length": 471.8, + "completions/min_terminated_length": 471.8, + "entropy": 0.46193733215332033, + "epoch": 0.17038777908343125, + "frac_reward_zero_std": 0.15, + "grad_norm": 1.0625224113464355, + "learning_rate": 4.980615459171311e-07, + "loss": 0.0038, + "num_tokens": 18280347.0, + "reward": 0.47427083253860475, + "reward_std": 0.22851565778255462, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.47427083253860475, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3152575194835663, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000032937526703, + "sampling/importance_sampling_ratio/min": 0.23960502099653241, + "sampling/sampling_logp_difference/max": 2.9025020360946656, + "sampling/sampling_logp_difference/mean": 0.019664775207638742, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2436.0, + "completions/max_terminated_length": 2295.6, + "completions/mean_length": 1323.515625, + "completions/mean_terminated_length": 1297.6546875, + "completions/min_length": 648.4, + "completions/min_terminated_length": 648.4, + "entropy": 0.44961647391319276, + "epoch": 0.1762632197414806, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.8402144312858582, + "learning_rate": 4.974557790162345e-07, + "loss": 0.0206, + "num_tokens": 19023648.0, + "reward": 0.40973958671092986, + "reward_std": 0.242001411318779, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.40973958671092986, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30689987242221833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000560522079467, + "sampling/importance_sampling_ratio/min": 0.21943920934572816, + "sampling/sampling_logp_difference/max": 2.0875226736068724, + "sampling/sampling_logp_difference/mean": 0.018646536394953728, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 2189.8, + "completions/max_terminated_length": 2100.0, + "completions/mean_length": 1232.584375, + "completions/mean_terminated_length": 1221.488134765625, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.4616339147090912, + "epoch": 0.18213866039952997, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8758718967437744, + "learning_rate": 4.96850012115338e-07, + "loss": -0.0048, + "num_tokens": 19727359.0, + "reward": 0.3982812583446503, + "reward_std": 0.23960140347480774, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3982812583446503, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3587407112121582, + "sampling/importance_sampling_ratio/max": 1.988889455795288, + "sampling/importance_sampling_ratio/mean": 1.0001670956611632, + "sampling/importance_sampling_ratio/min": 0.3248328477144241, + "sampling/sampling_logp_difference/max": 1.1651965141296388, + "sampling/sampling_logp_difference/mean": 0.01926850378513336, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2299.6, + "completions/max_terminated_length": 2056.8, + "completions/mean_length": 1074.50625, + "completions/mean_terminated_length": 1053.7550659179688, + "completions/min_length": 482.2, + "completions/min_terminated_length": 482.2, + "entropy": 0.44536136984825136, + "epoch": 0.18801410105757932, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8042799830436707, + "learning_rate": 4.962442452144414e-07, + "loss": 0.0248, + "num_tokens": 20380797.0, + "reward": 0.4602083504199982, + "reward_std": 0.21384959816932678, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4602083384990692, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3380684912204742, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000497698783875, + "sampling/importance_sampling_ratio/min": 0.24892064929008484, + "sampling/sampling_logp_difference/max": 1.4376299142837525, + "sampling/sampling_logp_difference/mean": 0.019230544194579126, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.028125, + "completions/max_length": 2365.4, + "completions/max_terminated_length": 2204.2, + "completions/mean_length": 1309.959375, + "completions/mean_terminated_length": 1276.075146484375, + "completions/min_length": 693.6, + "completions/min_terminated_length": 693.6, + "entropy": 0.45925586819648745, + "epoch": 0.19388954171562867, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.8627838492393494, + "learning_rate": 4.956384783135449e-07, + "loss": 0.0015, + "num_tokens": 21111084.0, + "reward": 0.4284895807504654, + "reward_std": 0.24629194140434266, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4284895807504654, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32105106115341187, + "sampling/importance_sampling_ratio/max": 1.9395444869995118, + "sampling/importance_sampling_ratio/mean": 0.9999743103981018, + "sampling/importance_sampling_ratio/min": 0.3220216006040573, + "sampling/sampling_logp_difference/max": 1.197959566116333, + "sampling/sampling_logp_difference/mean": 0.018676093593239785, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.071875, + "completions/max_length": 2413.2, + "completions/max_terminated_length": 2344.0, + "completions/mean_length": 1377.446875, + "completions/mean_terminated_length": 1291.2495361328124, + "completions/min_length": 661.8, + "completions/min_terminated_length": 661.8, + "entropy": 0.44391797184944154, + "epoch": 0.199764982373678, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.5744222402572632, + "learning_rate": 4.950327114126484e-07, + "loss": -0.0102, + "num_tokens": 21817791.0, + "reward": 0.37614584267139434, + "reward_std": 0.18710350692272187, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.37614584267139434, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3078536331653595, + "sampling/importance_sampling_ratio/max": 1.932727074623108, + "sampling/importance_sampling_ratio/mean": 0.9998797655105591, + "sampling/importance_sampling_ratio/min": 0.17999765202403067, + "sampling/sampling_logp_difference/max": 2.0344920635223387, + "sampling/sampling_logp_difference/mean": 0.018212152644991875, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.053125, + "completions/max_length": 2500.0, + "completions/max_terminated_length": 2317.2, + "completions/mean_length": 1387.946875, + "completions/mean_terminated_length": 1327.648046875, + "completions/min_length": 684.8, + "completions/min_terminated_length": 684.8, + "entropy": 0.4456583082675934, + "epoch": 0.2056404230317274, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.0140538215637207, + "learning_rate": 4.944269445117519e-07, + "loss": -0.0183, + "num_tokens": 22518842.0, + "reward": 0.3370312511920929, + "reward_std": 0.19164448380470275, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.337031252682209, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29072641432285307, + "sampling/importance_sampling_ratio/max": 1.9975413084030151, + "sampling/importance_sampling_ratio/mean": 1.0000341892242433, + "sampling/importance_sampling_ratio/min": 0.31241180300712584, + "sampling/sampling_logp_difference/max": 1.204920768737793, + "sampling/sampling_logp_difference/mean": 0.01850493885576725, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021875, + "completions/max_length": 2447.2, + "completions/max_terminated_length": 2257.8, + "completions/mean_length": 1337.359375, + "completions/mean_terminated_length": 1312.4825439453125, + "completions/min_length": 656.2, + "completions/min_terminated_length": 656.2, + "entropy": 0.45110672116279604, + "epoch": 0.21151586368977673, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.9597908854484558, + "learning_rate": 4.938211776108554e-07, + "loss": -0.0207, + "num_tokens": 23240145.0, + "reward": 0.4163541615009308, + "reward_std": 0.21045998930931092, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4163541793823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.289658859372139, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000091910362243, + "sampling/importance_sampling_ratio/min": 0.2906631052494049, + "sampling/sampling_logp_difference/max": 1.337960433959961, + "sampling/sampling_logp_difference/mean": 0.018258562311530113, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2301.8, + "completions/max_terminated_length": 2122.8, + "completions/mean_length": 1253.5125, + "completions/mean_terminated_length": 1237.3944091796875, + "completions/min_length": 634.2, + "completions/min_terminated_length": 634.2, + "entropy": 0.43949020504951475, + "epoch": 0.21739130434782608, + "frac_reward_zero_std": 0.15, + "grad_norm": 1.036664366722107, + "learning_rate": 4.932154107099588e-07, + "loss": -0.0049, + "num_tokens": 23945637.0, + "reward": 0.3103645980358124, + "reward_std": 0.22137612998485565, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3103645980358124, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3476561546325684, + "sampling/importance_sampling_ratio/max": 1.9506564617156983, + "sampling/importance_sampling_ratio/mean": 0.9999400854110718, + "sampling/importance_sampling_ratio/min": 0.23950822800397872, + "sampling/sampling_logp_difference/max": 1.6213460922241212, + "sampling/sampling_logp_difference/mean": 0.018114538118243217, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 2325.6, + "completions/max_terminated_length": 2258.8, + "completions/mean_length": 1329.1875, + "completions/mean_terminated_length": 1308.8904541015625, + "completions/min_length": 696.0, + "completions/min_terminated_length": 696.0, + "entropy": 0.42851370573043823, + "epoch": 0.22326674500587543, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8615764379501343, + "learning_rate": 4.926096438090623e-07, + "loss": -0.0222, + "num_tokens": 24671129.0, + "reward": 0.45187500715255735, + "reward_std": 0.19458201229572297, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.45187500715255735, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32191779315471647, + "sampling/importance_sampling_ratio/max": 1.9079566240310668, + "sampling/importance_sampling_ratio/mean": 1.0002107262611388, + "sampling/importance_sampling_ratio/min": 0.2823118090629578, + "sampling/sampling_logp_difference/max": 1.2879379272460938, + "sampling/sampling_logp_difference/mean": 0.01770188324153423, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2432.4, + "completions/max_terminated_length": 2339.2, + "completions/mean_length": 1379.309375, + "completions/mean_terminated_length": 1366.236669921875, + "completions/min_length": 681.4, + "completions/min_terminated_length": 681.4, + "entropy": 0.43162922859191893, + "epoch": 0.2291421856639248, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.5466075539588928, + "learning_rate": 4.920038769081657e-07, + "loss": 0.0155, + "num_tokens": 25448412.0, + "reward": 0.3679687574505806, + "reward_std": 0.1814160704612732, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3679687574505806, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26374678313732147, + "sampling/importance_sampling_ratio/max": 1.9999159097671508, + "sampling/importance_sampling_ratio/mean": 1.0000950813293457, + "sampling/importance_sampling_ratio/min": 0.2659080035984516, + "sampling/sampling_logp_difference/max": 1.629682731628418, + "sampling/sampling_logp_difference/mean": 0.017587186023592948, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2394.8, + "completions/max_terminated_length": 2268.2, + "completions/mean_length": 1396.196875, + "completions/mean_terminated_length": 1368.7766845703125, + "completions/min_length": 593.6, + "completions/min_terminated_length": 593.6, + "entropy": 0.45481058955192566, + "epoch": 0.23501762632197415, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.5931362509727478, + "learning_rate": 4.913981100072691e-07, + "loss": -0.0407, + "num_tokens": 26210379.0, + "reward": 0.4175000131130219, + "reward_std": 0.23002639412879944, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4175000131130219, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33679488897323606, + "sampling/importance_sampling_ratio/max": 1.951190209388733, + "sampling/importance_sampling_ratio/mean": 0.9999919533729553, + "sampling/importance_sampling_ratio/min": 0.2368324212729931, + "sampling/sampling_logp_difference/max": 1.608810019493103, + "sampling/sampling_logp_difference/mean": 0.01873408742249012, + "step": 200 + }, + { + "epoch": 0.23501762632197415, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.014375, + "eval_completions/max_length": 2369.72, + "eval_completions/max_terminated_length": 2250.0, + "eval_completions/mean_length": 1299.76125, + "eval_completions/mean_terminated_length": 1283.172724609375, + "eval_completions/min_length": 709.2, + "eval_completions/min_terminated_length": 709.2, + "eval_entropy": 0.42435838222503663, + "eval_frac_reward_zero_std": 0.14, + "eval_loss": 0.002923845313489437, + "eval_num_tokens": 26210379.0, + "eval_reward": 0.422958345413208, + "eval_reward_std": 0.1989289104938507, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.4229583466053009, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31139856338500976, + "eval_runtime": 603.448, + "eval_samples_per_second": 0.166, + "eval_sampling/importance_sampling_ratio/max": 1.9720039033889771, + "eval_sampling/importance_sampling_ratio/mean": 1.0000151467323304, + "eval_sampling/importance_sampling_ratio/min": 0.2624286452680826, + "eval_sampling/sampling_logp_difference/max": 1.560486044883728, + "eval_sampling/sampling_logp_difference/mean": 0.017812692523002625, + "eval_steps_per_second": 0.003, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2368.2, + "completions/max_terminated_length": 2345.2, + "completions/mean_length": 1356.390625, + "completions/mean_terminated_length": 1348.66416015625, + "completions/min_length": 700.8, + "completions/min_terminated_length": 700.8, + "entropy": 0.42209169268608093, + "epoch": 0.2408930669800235, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8277153372764587, + "learning_rate": 4.907923431063726e-07, + "loss": 0.0107, + "num_tokens": 26975488.0, + "reward": 0.45453126430511476, + "reward_std": 0.20152193903923035, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.45453126430511476, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3548352122306824, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000168442726136, + "sampling/importance_sampling_ratio/min": 0.2511422336101532, + "sampling/sampling_logp_difference/max": 1.4191609382629395, + "sampling/sampling_logp_difference/mean": 0.017601443454623222, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 2207.8, + "completions/max_terminated_length": 2191.8, + "completions/mean_length": 1347.0625, + "completions/mean_terminated_length": 1314.820458984375, + "completions/min_length": 608.6, + "completions/min_terminated_length": 608.6, + "entropy": 0.43602086901664733, + "epoch": 0.24676850763807284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.9807378649711609, + "learning_rate": 4.901865762054761e-07, + "loss": -0.0325, + "num_tokens": 27692204.0, + "reward": 0.40880208611488345, + "reward_std": 0.248151096701622, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4088020920753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30966946482658386, + "sampling/importance_sampling_ratio/max": 1.934563159942627, + "sampling/importance_sampling_ratio/mean": 0.9999630689620972, + "sampling/importance_sampling_ratio/min": 0.16581638418138028, + "sampling/sampling_logp_difference/max": 2.415492820739746, + "sampling/sampling_logp_difference/mean": 0.01788683459162712, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 2410.6, + "completions/max_terminated_length": 2362.2, + "completions/mean_length": 1393.41875, + "completions/mean_terminated_length": 1373.951611328125, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.4307835817337036, + "epoch": 0.2526439482961222, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.9995214343070984, + "learning_rate": 4.895808093045796e-07, + "loss": 0.0368, + "num_tokens": 28450426.0, + "reward": 0.46427084505558014, + "reward_std": 0.21769410669803618, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.46427084505558014, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34339686036109923, + "sampling/importance_sampling_ratio/max": 1.9890089750289917, + "sampling/importance_sampling_ratio/mean": 0.9999396681785584, + "sampling/importance_sampling_ratio/min": 0.16661263704299928, + "sampling/sampling_logp_difference/max": 1.9397084712982178, + "sampling/sampling_logp_difference/mean": 0.01795981228351593, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2269.0, + "completions/max_terminated_length": 2211.8, + "completions/mean_length": 1407.796875, + "completions/mean_terminated_length": 1390.346142578125, + "completions/min_length": 765.2, + "completions/min_terminated_length": 765.2, + "entropy": 0.42940289378166197, + "epoch": 0.25851938895417154, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.9037354588508606, + "learning_rate": 4.889750424036831e-07, + "loss": -0.0166, + "num_tokens": 29224677.0, + "reward": 0.4435416698455811, + "reward_std": 0.17664896845817565, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4435416698455811, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29355489313602445, + "sampling/importance_sampling_ratio/max": 1.9557361602783203, + "sampling/importance_sampling_ratio/mean": 0.9999936103820801, + "sampling/importance_sampling_ratio/min": 0.30081471651792524, + "sampling/sampling_logp_difference/max": 1.4767087697982788, + "sampling/sampling_logp_difference/mean": 0.018042086437344552, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.028125, + "completions/max_length": 2500.0, + "completions/max_terminated_length": 2453.4, + "completions/mean_length": 1509.70625, + "completions/mean_terminated_length": 1481.46875, + "completions/min_length": 920.2, + "completions/min_terminated_length": 920.2, + "entropy": 0.4266699433326721, + "epoch": 0.26439482961222094, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.4904046356678009, + "learning_rate": 4.883692755027865e-07, + "loss": -0.0152, + "num_tokens": 29973283.0, + "reward": 0.3910937547683716, + "reward_std": 0.1875857561826706, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3910937488079071, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3184710621833801, + "sampling/importance_sampling_ratio/max": 1.9934840202331543, + "sampling/importance_sampling_ratio/mean": 0.9998901844024658, + "sampling/importance_sampling_ratio/min": 0.30794100314378736, + "sampling/sampling_logp_difference/max": 1.3923618793487549, + "sampling/sampling_logp_difference/mean": 0.017345474287867545, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2255.6, + "completions/max_terminated_length": 2215.8, + "completions/mean_length": 1363.684375, + "completions/mean_terminated_length": 1348.9182373046874, + "completions/min_length": 705.0, + "completions/min_terminated_length": 705.0, + "entropy": 0.4166957139968872, + "epoch": 0.2702702702702703, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8560689687728882, + "learning_rate": 4.8776350860189e-07, + "loss": -0.013, + "num_tokens": 30756366.0, + "reward": 0.5055729269981384, + "reward_std": 0.17977026402950286, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.505572932958603, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25386624932289126, + "sampling/importance_sampling_ratio/max": 1.954643964767456, + "sampling/importance_sampling_ratio/mean": 1.000126600265503, + "sampling/importance_sampling_ratio/min": 0.35047273635864257, + "sampling/sampling_logp_difference/max": 1.0577001810073852, + "sampling/sampling_logp_difference/mean": 0.017576563358306884, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021875, + "completions/max_length": 2500.0, + "completions/max_terminated_length": 2310.6, + "completions/mean_length": 1463.453125, + "completions/mean_terminated_length": 1441.3449951171874, + "completions/min_length": 834.8, + "completions/min_terminated_length": 834.8, + "entropy": 0.4108987033367157, + "epoch": 0.27614571092831963, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.7477687001228333, + "learning_rate": 4.871577417009934e-07, + "loss": -0.0453, + "num_tokens": 31537571.0, + "reward": 0.43791667819023133, + "reward_std": 0.205858114361763, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.43791667819023133, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29302410781383514, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999675035476685, + "sampling/importance_sampling_ratio/min": 0.174767720699463, + "sampling/sampling_logp_difference/max": 6.906278848648071, + "sampling/sampling_logp_difference/mean": 0.01736109107732773, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.028125, + "completions/max_length": 2289.8, + "completions/max_terminated_length": 2215.6, + "completions/mean_length": 1420.634375, + "completions/mean_terminated_length": 1395.6822998046875, + "completions/min_length": 808.0, + "completions/min_terminated_length": 808.0, + "entropy": 0.4105457663536072, + "epoch": 0.282021151586369, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.8343913555145264, + "learning_rate": 4.865519748000969e-07, + "loss": -0.0394, + "num_tokens": 32276378.0, + "reward": 0.48182291984558107, + "reward_std": 0.21514118313789368, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.48182291984558107, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29770660400390625, + "sampling/importance_sampling_ratio/max": 1.9636056900024415, + "sampling/importance_sampling_ratio/mean": 0.9999726414680481, + "sampling/importance_sampling_ratio/min": 0.3000651866197586, + "sampling/sampling_logp_difference/max": 1.286457371711731, + "sampling/sampling_logp_difference/mean": 0.017064289748668672, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2353.2, + "completions/max_terminated_length": 2159.0, + "completions/mean_length": 1367.946875, + "completions/mean_terminated_length": 1353.3779052734376, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.3979348599910736, + "epoch": 0.2878965922444183, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.831149160861969, + "learning_rate": 4.859462078992004e-07, + "loss": -0.0077, + "num_tokens": 33040329.0, + "reward": 0.4345312714576721, + "reward_std": 0.2075097978115082, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4345312714576721, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29992562234401704, + "sampling/importance_sampling_ratio/max": 1.947805905342102, + "sampling/importance_sampling_ratio/mean": 0.9999383926391602, + "sampling/importance_sampling_ratio/min": 0.2707966983318329, + "sampling/sampling_logp_difference/max": 1.6397278547286986, + "sampling/sampling_logp_difference/mean": 0.016731590032577515, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2313.4, + "completions/max_terminated_length": 2260.0, + "completions/mean_length": 1422.896875, + "completions/mean_terminated_length": 1419.680224609375, + "completions/min_length": 828.2, + "completions/min_terminated_length": 828.2, + "entropy": 0.40560716986656187, + "epoch": 0.2937720329024677, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8927066922187805, + "learning_rate": 4.853404409983038e-07, + "loss": 0.0072, + "num_tokens": 33791348.0, + "reward": 0.5130208373069763, + "reward_std": 0.1809627652168274, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5130208373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2381391167640686, + "sampling/importance_sampling_ratio/max": 1.878486657142639, + "sampling/importance_sampling_ratio/mean": 1.0000173091888427, + "sampling/importance_sampling_ratio/min": 0.16526238694787027, + "sampling/sampling_logp_difference/max": 2.048710656166077, + "sampling/sampling_logp_difference/mean": 0.016782762855291365, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021875, + "completions/max_length": 2481.2, + "completions/max_terminated_length": 2422.4, + "completions/mean_length": 1547.384375, + "completions/mean_terminated_length": 1526.5686279296874, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.423099547624588, + "epoch": 0.299647473560517, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.771557092666626, + "learning_rate": 4.847346740974073e-07, + "loss": -0.0087, + "num_tokens": 34596563.0, + "reward": 0.46609375476837156, + "reward_std": 0.22107117474079133, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.46609376072883607, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30911803245544434, + "sampling/importance_sampling_ratio/max": 1.9809560775756836, + "sampling/importance_sampling_ratio/mean": 1.0000070095062257, + "sampling/importance_sampling_ratio/min": 0.26680448576807975, + "sampling/sampling_logp_difference/max": 1.5669462442398072, + "sampling/sampling_logp_difference/mean": 0.017237287014722824, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 2367.2, + "completions/max_terminated_length": 2233.0, + "completions/mean_length": 1370.38125, + "completions/mean_terminated_length": 1349.311376953125, + "completions/min_length": 832.6, + "completions/min_terminated_length": 832.6, + "entropy": 0.4151655673980713, + "epoch": 0.30552291421856637, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8248306512832642, + "learning_rate": 4.841289071965108e-07, + "loss": 0.0028, + "num_tokens": 35339573.0, + "reward": 0.35020835101604464, + "reward_std": 0.1863805890083313, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.35020834505558013, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3319753110408783, + "sampling/importance_sampling_ratio/max": 1.9679210186004639, + "sampling/importance_sampling_ratio/mean": 0.9999267935752869, + "sampling/importance_sampling_ratio/min": 0.3026794917881489, + "sampling/sampling_logp_difference/max": 1.417945671081543, + "sampling/sampling_logp_difference/mean": 0.017283813282847404, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06875, + "completions/max_length": 2469.0, + "completions/max_terminated_length": 2381.2, + "completions/mean_length": 1549.65, + "completions/mean_terminated_length": 1483.4290283203125, + "completions/min_length": 774.2, + "completions/min_terminated_length": 774.2, + "entropy": 0.42785446643829345, + "epoch": 0.31139835487661577, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.6696219444274902, + "learning_rate": 4.835231402956143e-07, + "loss": -0.0622, + "num_tokens": 36133117.0, + "reward": 0.4380729258060455, + "reward_std": 0.207410229742527, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4380729258060455, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3342203199863434, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000066590309143, + "sampling/importance_sampling_ratio/min": 0.26250506937503815, + "sampling/sampling_logp_difference/max": 1.4055633783340453, + "sampling/sampling_logp_difference/mean": 0.017540974915027617, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2450.4, + "completions/max_terminated_length": 2289.0, + "completions/mean_length": 1445.24375, + "completions/mean_terminated_length": 1428.3465087890625, + "completions/min_length": 873.8, + "completions/min_terminated_length": 873.8, + "entropy": 0.42337934374809266, + "epoch": 0.3172737955346651, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.6972928643226624, + "learning_rate": 4.829173733947177e-07, + "loss": -0.0112, + "num_tokens": 36890567.0, + "reward": 0.4680208325386047, + "reward_std": 0.20582786798477173, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4680208325386047, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2813736617565155, + "sampling/importance_sampling_ratio/max": 1.9894936561584473, + "sampling/importance_sampling_ratio/mean": 1.0000147461891173, + "sampling/importance_sampling_ratio/min": 0.2970141440629959, + "sampling/sampling_logp_difference/max": 1.4801963686943054, + "sampling/sampling_logp_difference/mean": 0.01727181263267994, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2370.0, + "completions/max_terminated_length": 2325.6, + "completions/mean_length": 1453.1125, + "completions/mean_terminated_length": 1439.437353515625, + "completions/min_length": 859.2, + "completions/min_terminated_length": 859.2, + "entropy": 0.43045341968536377, + "epoch": 0.32314923619271446, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6425484418869019, + "learning_rate": 4.823116064938211e-07, + "loss": 0.0202, + "num_tokens": 37673687.0, + "reward": 0.35296875834465025, + "reward_std": 0.18358486890792847, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.35296875834465025, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2992075264453888, + "sampling/importance_sampling_ratio/max": 1.9819918394088745, + "sampling/importance_sampling_ratio/mean": 1.0000043153762816, + "sampling/importance_sampling_ratio/min": 0.28931107074022294, + "sampling/sampling_logp_difference/max": 1.379153299331665, + "sampling/sampling_logp_difference/mean": 0.017378567531704903, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2150.0, + "completions/max_terminated_length": 2112.6, + "completions/mean_length": 1302.640625, + "completions/mean_terminated_length": 1295.7153076171876, + "completions/min_length": 780.0, + "completions/min_terminated_length": 780.0, + "entropy": 0.39050028920173646, + "epoch": 0.3290246768507638, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.6917908787727356, + "learning_rate": 4.817058395929246e-07, + "loss": -0.0274, + "num_tokens": 38388284.0, + "reward": 0.5204687595367432, + "reward_std": 0.19263336360454558, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5204687595367432, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2920966506004333, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999744057655334, + "sampling/importance_sampling_ratio/min": 0.3137810334563255, + "sampling/sampling_logp_difference/max": 1.2811630487442016, + "sampling/sampling_logp_difference/mean": 0.016145946830511092, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2261.2, + "completions/max_terminated_length": 2259.8, + "completions/mean_length": 1407.540625, + "completions/mean_terminated_length": 1404.1890625, + "completions/min_length": 876.2, + "completions/min_terminated_length": 876.2, + "entropy": 0.40756009221076966, + "epoch": 0.33490011750881316, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.7234519720077515, + "learning_rate": 4.81100072692028e-07, + "loss": 0.0238, + "num_tokens": 39186805.0, + "reward": 0.4428645968437195, + "reward_std": 0.1544080436229706, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4428645968437195, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23549820780754088, + "sampling/importance_sampling_ratio/max": 1.9689576625823975, + "sampling/importance_sampling_ratio/mean": 0.9999301791191101, + "sampling/importance_sampling_ratio/min": 0.1660400189459324, + "sampling/sampling_logp_difference/max": 2.068434953689575, + "sampling/sampling_logp_difference/mean": 0.016894153505563735, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 2325.8, + "completions/max_terminated_length": 2113.4, + "completions/mean_length": 1382.234375, + "completions/mean_terminated_length": 1360.899755859375, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "entropy": 0.4158647537231445, + "epoch": 0.3407755581668625, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.5713046789169312, + "learning_rate": 4.804943057911315e-07, + "loss": -0.0197, + "num_tokens": 39935944.0, + "reward": 0.36718750596046446, + "reward_std": 0.16592961102724074, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.36718750596046446, + "rewards/e2e_recall_precision_mixed_reward/std": 0.299328675866127, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999416589736938, + "sampling/importance_sampling_ratio/min": 0.3071061834692955, + "sampling/sampling_logp_difference/max": 1.781144905090332, + "sampling/sampling_logp_difference/mean": 0.01737259849905968, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2227.0, + "completions/max_terminated_length": 2149.0, + "completions/mean_length": 1325.434375, + "completions/mean_terminated_length": 1311.261669921875, + "completions/min_length": 827.2, + "completions/min_terminated_length": 827.2, + "entropy": 0.40315585732460024, + "epoch": 0.34665099882491185, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.8399536609649658, + "learning_rate": 4.79888538890235e-07, + "loss": -0.0172, + "num_tokens": 40637027.0, + "reward": 0.5668229222297668, + "reward_std": 0.238536736369133, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5668229222297668, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3036984860897064, + "sampling/importance_sampling_ratio/max": 1.9938727378845216, + "sampling/importance_sampling_ratio/mean": 0.9999548077583313, + "sampling/importance_sampling_ratio/min": 0.2874012000946095, + "sampling/sampling_logp_difference/max": 2.626276063919067, + "sampling/sampling_logp_difference/mean": 0.016770557686686515, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2359.0, + "completions/max_terminated_length": 2342.8, + "completions/mean_length": 1378.61875, + "completions/mean_terminated_length": 1365.0980712890625, + "completions/min_length": 871.0, + "completions/min_terminated_length": 871.0, + "entropy": 0.40095179080963134, + "epoch": 0.3525264394829612, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8633328676223755, + "learning_rate": 4.792827719893385e-07, + "loss": 0.0037, + "num_tokens": 41401417.0, + "reward": 0.4402604281902313, + "reward_std": 0.18189195394515992, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4402604281902313, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3080439865589142, + "sampling/importance_sampling_ratio/max": 1.9228517293930054, + "sampling/importance_sampling_ratio/mean": 0.9999574422836304, + "sampling/importance_sampling_ratio/min": 0.27484258711338044, + "sampling/sampling_logp_difference/max": 1.4705661535263062, + "sampling/sampling_logp_difference/mean": 0.016645203903317453, + "step": 300 + }, + { + "epoch": 0.3525264394829612, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.00875, + "eval_completions/max_length": 2255.24, + "eval_completions/max_terminated_length": 2190.56, + "eval_completions/mean_length": 1306.115, + "eval_completions/mean_terminated_length": 1296.024951171875, + "eval_completions/min_length": 795.84, + "eval_completions/min_terminated_length": 795.84, + "eval_entropy": 0.39472726941108705, + "eval_frac_reward_zero_std": 0.14, + "eval_loss": 0.0051555633544921875, + "eval_num_tokens": 41401417.0, + "eval_reward": 0.4641666793823242, + "eval_reward_std": 0.1855441576242447, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.46416668176651, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3040527403354645, + "eval_runtime": 591.545, + "eval_samples_per_second": 0.169, + "eval_sampling/importance_sampling_ratio/max": 1.9726946783065795, + "eval_sampling/importance_sampling_ratio/mean": 1.000031213760376, + "eval_sampling/importance_sampling_ratio/min": 0.2702025346830487, + "eval_sampling/sampling_logp_difference/max": 1.6381762075424193, + "eval_sampling/sampling_logp_difference/mean": 0.016649646908044814, + "eval_steps_per_second": 0.003, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01875, + "completions/max_length": 2440.6, + "completions/max_terminated_length": 2256.6, + "completions/mean_length": 1346.771875, + "completions/mean_terminated_length": 1326.0095458984374, + "completions/min_length": 766.4, + "completions/min_terminated_length": 766.4, + "entropy": 0.3954951822757721, + "epoch": 0.3584018801410106, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.6830638647079468, + "learning_rate": 4.78677005088442e-07, + "loss": -0.0205, + "num_tokens": 42149976.0, + "reward": 0.5190104305744171, + "reward_std": 0.2403048187494278, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5190104246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32388275265693667, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999425649642945, + "sampling/importance_sampling_ratio/min": 0.2714868515729904, + "sampling/sampling_logp_difference/max": 1.3850975036621094, + "sampling/sampling_logp_difference/mean": 0.01655212976038456, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 2315.6, + "completions/max_terminated_length": 2119.4, + "completions/mean_length": 1415.84375, + "completions/mean_terminated_length": 1406.0834228515625, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "entropy": 0.3955690324306488, + "epoch": 0.36427732079905994, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.48863622546195984, + "learning_rate": 4.780712381875454e-07, + "loss": 0.0052, + "num_tokens": 42944954.0, + "reward": 0.36197916865348817, + "reward_std": 0.17809403240680693, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.3619791746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2854657083749771, + "sampling/importance_sampling_ratio/max": 1.989390754699707, + "sampling/importance_sampling_ratio/mean": 0.9999927639961242, + "sampling/importance_sampling_ratio/min": 0.3247060298919678, + "sampling/sampling_logp_difference/max": 1.2260807275772094, + "sampling/sampling_logp_difference/mean": 0.016654501855373382, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2313.8, + "completions/max_terminated_length": 2162.2, + "completions/mean_length": 1346.1375, + "completions/mean_terminated_length": 1327.4448486328124, + "completions/min_length": 794.2, + "completions/min_terminated_length": 794.2, + "entropy": 0.405675995349884, + "epoch": 0.3701527614571093, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.6851405501365662, + "learning_rate": 4.774654712866488e-07, + "loss": -0.0064, + "num_tokens": 43692738.0, + "reward": 0.5561979353427887, + "reward_std": 0.20282128155231477, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5561979293823243, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3183633327484131, + "sampling/importance_sampling_ratio/max": 1.9703486680984497, + "sampling/importance_sampling_ratio/mean": 0.9999482989311218, + "sampling/importance_sampling_ratio/min": 0.26732968389987943, + "sampling/sampling_logp_difference/max": 1.3939155101776124, + "sampling/sampling_logp_difference/mean": 0.016717956587672233, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2268.8, + "completions/max_terminated_length": 2231.6, + "completions/mean_length": 1383.540625, + "completions/mean_terminated_length": 1367.305126953125, + "completions/min_length": 839.8, + "completions/min_terminated_length": 839.8, + "entropy": 0.4027079105377197, + "epoch": 0.37602820211515864, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.903359591960907, + "learning_rate": 4.768597043857523e-07, + "loss": -0.0064, + "num_tokens": 44430635.0, + "reward": 0.45427083373069765, + "reward_std": 0.19985013008117675, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.45427083373069765, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26199466586112974, + "sampling/importance_sampling_ratio/max": 1.9889472484588624, + "sampling/importance_sampling_ratio/mean": 1.0000965476036072, + "sampling/importance_sampling_ratio/min": 0.2858219683170319, + "sampling/sampling_logp_difference/max": 1.5289599657058717, + "sampling/sampling_logp_difference/mean": 0.016677992790937422, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2299.2, + "completions/max_terminated_length": 2297.4, + "completions/mean_length": 1311.9625, + "completions/mean_terminated_length": 1308.5442626953125, + "completions/min_length": 867.2, + "completions/min_terminated_length": 867.2, + "entropy": 0.411800742149353, + "epoch": 0.381903642773208, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8514150381088257, + "learning_rate": 4.7625393748485583e-07, + "loss": 0.0109, + "num_tokens": 45151851.0, + "reward": 0.48619791865348816, + "reward_std": 0.19733970016241073, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.48619791865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.310748428106308, + "sampling/importance_sampling_ratio/max": 1.9813976764678956, + "sampling/importance_sampling_ratio/mean": 1.000037384033203, + "sampling/importance_sampling_ratio/min": 0.2959585070610046, + "sampling/sampling_logp_difference/max": 1.2915854692459106, + "sampling/sampling_logp_difference/mean": 0.017146169394254684, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2122.4, + "completions/max_terminated_length": 2083.8, + "completions/mean_length": 1377.2875, + "completions/mean_terminated_length": 1365.130322265625, + "completions/min_length": 872.2, + "completions/min_terminated_length": 872.2, + "entropy": 0.39033161997795107, + "epoch": 0.38777908343125733, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.7350701093673706, + "learning_rate": 4.7564817058395926e-07, + "loss": -0.009, + "num_tokens": 45916567.0, + "reward": 0.39609376192092893, + "reward_std": 0.19951523691415787, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.39609376192092893, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3023851901292801, + "sampling/importance_sampling_ratio/max": 1.9538982629776, + "sampling/importance_sampling_ratio/mean": 0.9999711871147156, + "sampling/importance_sampling_ratio/min": 0.15197787082288414, + "sampling/sampling_logp_difference/max": 2.7049013137817384, + "sampling/sampling_logp_difference/mean": 0.01634741071611643, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2365.0, + "completions/max_terminated_length": 2233.2, + "completions/mean_length": 1297.45, + "completions/mean_terminated_length": 1281.5479736328125, + "completions/min_length": 830.0, + "completions/min_terminated_length": 830.0, + "entropy": 0.40079636573791505, + "epoch": 0.3936545240893067, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.6987754702568054, + "learning_rate": 4.7504240368306275e-07, + "loss": 0.0049, + "num_tokens": 46645495.0, + "reward": 0.49000000655651094, + "reward_std": 0.14566230401396751, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.49000000953674316, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23847460746765137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000020718574524, + "sampling/importance_sampling_ratio/min": 0.24331406950950624, + "sampling/sampling_logp_difference/max": 1.6468490839004517, + "sampling/sampling_logp_difference/mean": 0.01682475283741951, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2367.6, + "completions/max_terminated_length": 2156.2, + "completions/mean_length": 1326.51875, + "completions/mean_terminated_length": 1309.7556640625, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.3823336660861969, + "epoch": 0.399529964747356, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.6209218502044678, + "learning_rate": 4.7443663678216624e-07, + "loss": 0.01, + "num_tokens": 47367257.0, + "reward": 0.5150520920753479, + "reward_std": 0.22551013827323912, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5150520920753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3198787569999695, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999925410747528, + "sampling/importance_sampling_ratio/min": 0.36856586337089536, + "sampling/sampling_logp_difference/max": 1.3121527194976808, + "sampling/sampling_logp_difference/mean": 0.01604050975292921, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1965.8, + "completions/max_terminated_length": 1965.8, + "completions/mean_length": 1185.409375, + "completions/mean_terminated_length": 1185.409375, + "completions/min_length": 730.8, + "completions/min_terminated_length": 730.8, + "entropy": 0.38586640954017637, + "epoch": 0.40540540540540543, + "frac_reward_zero_std": 0.05, + "grad_norm": 1.4014785289764404, + "learning_rate": 4.738308698812697e-07, + "loss": 0.0466, + "num_tokens": 48037180.0, + "reward": 0.5460416734218597, + "reward_std": 0.19236631989479064, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5460416972637177, + "rewards/e2e_recall_precision_mixed_reward/std": 0.292988184094429, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999979043006897, + "sampling/importance_sampling_ratio/min": 0.29654269516468046, + "sampling/sampling_logp_difference/max": 1.231632113456726, + "sampling/sampling_logp_difference/mean": 0.016064048185944556, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 2091.2, + "completions/max_terminated_length": 2082.6, + "completions/mean_length": 1254.175, + "completions/mean_terminated_length": 1243.588623046875, + "completions/min_length": 818.2, + "completions/min_terminated_length": 818.2, + "entropy": 0.3805388808250427, + "epoch": 0.4112808460634548, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.7526155710220337, + "learning_rate": 4.7322510298037317e-07, + "loss": 0.0057, + "num_tokens": 48776344.0, + "reward": 0.4618229269981384, + "reward_std": 0.15769713521003723, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4618229269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27449490427970885, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000930905342102, + "sampling/importance_sampling_ratio/min": 0.2510767489671707, + "sampling/sampling_logp_difference/max": 2.2086389303207397, + "sampling/sampling_logp_difference/mean": 0.015983594954013823, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2081.6, + "completions/max_terminated_length": 2033.8, + "completions/mean_length": 1256.4, + "completions/mean_terminated_length": 1252.8534912109376, + "completions/min_length": 803.2, + "completions/min_terminated_length": 803.2, + "entropy": 0.36754541397094725, + "epoch": 0.4171562867215041, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.5367698669433594, + "learning_rate": 4.7261933607947655e-07, + "loss": -0.013, + "num_tokens": 49508964.0, + "reward": 0.5018229305744171, + "reward_std": 0.1739724576473236, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5018229305744171, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2954780399799347, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998829245567322, + "sampling/importance_sampling_ratio/min": 0.27626035958528516, + "sampling/sampling_logp_difference/max": 1.4486989021301269, + "sampling/sampling_logp_difference/mean": 0.015474473871290684, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2188.4, + "completions/max_terminated_length": 2184.4, + "completions/mean_length": 1234.028125, + "completions/mean_terminated_length": 1226.1088623046876, + "completions/min_length": 753.4, + "completions/min_terminated_length": 753.4, + "entropy": 0.38073245286941526, + "epoch": 0.42303172737955347, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7556249499320984, + "learning_rate": 4.7201356917858004e-07, + "loss": -0.0105, + "num_tokens": 50218789.0, + "reward": 0.42156251668930056, + "reward_std": 0.1456248864531517, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.421562522649765, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23054299950599672, + "sampling/importance_sampling_ratio/max": 1.9770933151245118, + "sampling/importance_sampling_ratio/mean": 1.0000459671020507, + "sampling/importance_sampling_ratio/min": 0.3449200510978699, + "sampling/sampling_logp_difference/max": 1.3223530054092407, + "sampling/sampling_logp_difference/mean": 0.0159611064940691, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1993.4, + "completions/max_terminated_length": 1993.4, + "completions/mean_length": 1205.571875, + "completions/mean_terminated_length": 1205.571875, + "completions/min_length": 794.4, + "completions/min_terminated_length": 794.4, + "entropy": 0.390489786863327, + "epoch": 0.4289071680376028, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.9579271078109741, + "learning_rate": 4.7140780227768353e-07, + "loss": 0.0223, + "num_tokens": 50928908.0, + "reward": 0.537968760728836, + "reward_std": 0.21120007038116456, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5379687666893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2601155489683151, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999605894088746, + "sampling/importance_sampling_ratio/min": 0.23600251823663712, + "sampling/sampling_logp_difference/max": 1.7278624057769776, + "sampling/sampling_logp_difference/mean": 0.016414126753807066, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1030.24375, + "completions/mean_terminated_length": 1030.24375, + "completions/min_length": 696.4, + "completions/min_terminated_length": 696.4, + "entropy": 0.3575406074523926, + "epoch": 0.43478260869565216, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.7329716086387634, + "learning_rate": 4.7080203537678697e-07, + "loss": 0.0058, + "num_tokens": 51567930.0, + "reward": 0.5391145884990692, + "reward_std": 0.19752640128135682, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5391145884990692, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27296304106712344, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999308943748474, + "sampling/importance_sampling_ratio/min": 0.2641163617372513, + "sampling/sampling_logp_difference/max": 1.4609861969947815, + "sampling/sampling_logp_difference/mean": 0.015179168432950974, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1568.4, + "completions/max_terminated_length": 1568.4, + "completions/mean_length": 920.165625, + "completions/mean_terminated_length": 920.165625, + "completions/min_length": 506.6, + "completions/min_terminated_length": 506.6, + "entropy": 0.39158068895339965, + "epoch": 0.4406580493537015, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8551925420761108, + "learning_rate": 4.7019626847589046e-07, + "loss": 0.0294, + "num_tokens": 52161087.0, + "reward": 0.5165625095367432, + "reward_std": 0.19143509566783906, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5165625095367432, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3149138242006302, + "sampling/importance_sampling_ratio/max": 1.9255353212356567, + "sampling/importance_sampling_ratio/mean": 1.0001346826553346, + "sampling/importance_sampling_ratio/min": 0.2629414364695549, + "sampling/sampling_logp_difference/max": 1.4847426176071168, + "sampling/sampling_logp_difference/mean": 0.016572076827287674, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1716.4, + "completions/max_terminated_length": 1716.4, + "completions/mean_length": 999.54375, + "completions/mean_terminated_length": 999.54375, + "completions/min_length": 445.2, + "completions/min_terminated_length": 445.2, + "entropy": 0.3913616418838501, + "epoch": 0.44653349001175086, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8592411279678345, + "learning_rate": 4.695905015749939e-07, + "loss": 0.021, + "num_tokens": 52813725.0, + "reward": 0.5802604496479035, + "reward_std": 0.22083891928195953, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5802604377269744, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3036410689353943, + "sampling/importance_sampling_ratio/max": 1.9878347158432006, + "sampling/importance_sampling_ratio/mean": 0.9998912930488586, + "sampling/importance_sampling_ratio/min": 0.2878791332244873, + "sampling/sampling_logp_difference/max": 1.5880217552185059, + "sampling/sampling_logp_difference/mean": 0.01670288797467947, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1729.8, + "completions/max_terminated_length": 1729.8, + "completions/mean_length": 996.284375, + "completions/mean_terminated_length": 996.284375, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "entropy": 0.3791182518005371, + "epoch": 0.45240893066980026, + "frac_reward_zero_std": 0.1, + "grad_norm": 1.009487271308899, + "learning_rate": 4.689847346740974e-07, + "loss": 0.0119, + "num_tokens": 53446856.0, + "reward": 0.567864590883255, + "reward_std": 0.2009609282016754, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.567864590883255, + "rewards/e2e_recall_precision_mixed_reward/std": 0.331317538022995, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000531435012818, + "sampling/importance_sampling_ratio/min": 0.36831892728805543, + "sampling/sampling_logp_difference/max": 1.1540207147598267, + "sampling/sampling_logp_difference/mean": 0.01622724235057831, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1572.2, + "completions/max_terminated_length": 1572.2, + "completions/mean_length": 987.134375, + "completions/mean_terminated_length": 987.134375, + "completions/min_length": 575.0, + "completions/min_terminated_length": 575.0, + "entropy": 0.3846351742744446, + "epoch": 0.4582843713278496, + "frac_reward_zero_std": 0.1, + "grad_norm": 1.2462704181671143, + "learning_rate": 4.6837896777320087e-07, + "loss": -0.0292, + "num_tokens": 54089075.0, + "reward": 0.5090625166893006, + "reward_std": 0.2253621369600296, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5090625166893006, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32490702867507937, + "sampling/importance_sampling_ratio/max": 1.8669166564941406, + "sampling/importance_sampling_ratio/mean": 1.0000983238220216, + "sampling/importance_sampling_ratio/min": 0.2756506517529488, + "sampling/sampling_logp_difference/max": 1.474560058116913, + "sampling/sampling_logp_difference/mean": 0.016727247275412083, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1904.6, + "completions/max_terminated_length": 1772.6, + "completions/mean_length": 1097.10625, + "completions/mean_terminated_length": 1092.751806640625, + "completions/min_length": 723.4, + "completions/min_terminated_length": 723.4, + "entropy": 0.37217140197753906, + "epoch": 0.46415981198589895, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.9669972062110901, + "learning_rate": 4.677732008723043e-07, + "loss": -0.0141, + "num_tokens": 54781809.0, + "reward": 0.4870833456516266, + "reward_std": 0.15707829147577285, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4870833456516266, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30188499987125395, + "sampling/importance_sampling_ratio/max": 1.9380834102630615, + "sampling/importance_sampling_ratio/mean": 0.9999426603317261, + "sampling/importance_sampling_ratio/min": 0.24590765461325645, + "sampling/sampling_logp_difference/max": 1.9788538694381714, + "sampling/sampling_logp_difference/mean": 0.01591875497251749, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.6, + "completions/max_terminated_length": 1558.6, + "completions/mean_length": 1044.515625, + "completions/mean_terminated_length": 1044.515625, + "completions/min_length": 680.8, + "completions/min_terminated_length": 680.8, + "entropy": 0.3540132224559784, + "epoch": 0.4700352526439483, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.9251359105110168, + "learning_rate": 4.671674339714078e-07, + "loss": -0.0023, + "num_tokens": 55438886.0, + "reward": 0.5378125071525574, + "reward_std": 0.18125876784324646, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5378125071525574, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30963089168071745, + "sampling/importance_sampling_ratio/max": 1.9805966854095458, + "sampling/importance_sampling_ratio/mean": 0.9999330878257752, + "sampling/importance_sampling_ratio/min": 0.2766235023736954, + "sampling/sampling_logp_difference/max": 1.4273133754730225, + "sampling/sampling_logp_difference/mean": 0.015250829048454762, + "step": 400 + }, + { + "epoch": 0.4700352526439483, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1663.24, + "eval_completions/max_terminated_length": 1663.24, + "eval_completions/mean_length": 1068.386875, + "eval_completions/mean_terminated_length": 1068.386875, + "eval_completions/min_length": 682.2, + "eval_completions/min_terminated_length": 682.2, + "eval_entropy": 0.37045517563819885, + "eval_frac_reward_zero_std": 0.18, + "eval_loss": 0.010637059807777405, + "eval_num_tokens": 55438886.0, + "eval_reward": 0.48254167914390567, + "eval_reward_std": 0.171167514026165, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.4825416815280914, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29884088337421416, + "eval_runtime": 435.7442, + "eval_samples_per_second": 0.229, + "eval_sampling/importance_sampling_ratio/max": 1.9646133232116698, + "eval_sampling/importance_sampling_ratio/mean": 0.999952335357666, + "eval_sampling/importance_sampling_ratio/min": 0.3115068358182907, + "eval_sampling/sampling_logp_difference/max": 1.2885041189193727, + "eval_sampling/sampling_logp_difference/mean": 0.015734767876565456, + "eval_steps_per_second": 0.005, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1792.6, + "completions/max_terminated_length": 1771.2, + "completions/mean_length": 1133.459375, + "completions/mean_terminated_length": 1129.54833984375, + "completions/min_length": 769.6, + "completions/min_terminated_length": 769.6, + "entropy": 0.361786413192749, + "epoch": 0.47591069330199764, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6992851495742798, + "learning_rate": 4.6656166707051123e-07, + "loss": 0.0071, + "num_tokens": 56134341.0, + "reward": 0.5578646063804626, + "reward_std": 0.1855187177658081, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5578646123409271, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28627926409244536, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000202178955078, + "sampling/importance_sampling_ratio/min": 0.3811205953359604, + "sampling/sampling_logp_difference/max": 1.274878692626953, + "sampling/sampling_logp_difference/mean": 0.015369786508381366, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1997.2, + "completions/max_terminated_length": 1997.2, + "completions/mean_length": 1159.346875, + "completions/mean_terminated_length": 1159.346875, + "completions/min_length": 732.4, + "completions/min_terminated_length": 732.4, + "entropy": 0.3553011953830719, + "epoch": 0.481786133960047, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.205419898033142, + "learning_rate": 4.659559001696147e-07, + "loss": 0.0217, + "num_tokens": 56814612.0, + "reward": 0.5170312762260437, + "reward_std": 0.17431987226009368, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5170312762260437, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2719025552272797, + "sampling/importance_sampling_ratio/max": 1.979778790473938, + "sampling/importance_sampling_ratio/mean": 0.9998809933662415, + "sampling/importance_sampling_ratio/min": 0.3756587505340576, + "sampling/sampling_logp_difference/max": 1.0862587213516235, + "sampling/sampling_logp_difference/mean": 0.014951322041451932, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1706.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 1093.56875, + "completions/mean_terminated_length": 1093.56875, + "completions/min_length": 690.4, + "completions/min_terminated_length": 690.4, + "entropy": 0.36290356516838074, + "epoch": 0.48766157461809634, + "frac_reward_zero_std": 0.2, + "grad_norm": 1.1141079664230347, + "learning_rate": 4.653501332687182e-07, + "loss": 0.0016, + "num_tokens": 57490778.0, + "reward": 0.47807292342185975, + "reward_std": 0.16014991998672484, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.47807292342185975, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29495506882667544, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000198245048524, + "sampling/importance_sampling_ratio/min": 0.29137383252382276, + "sampling/sampling_logp_difference/max": 1.4087148904800415, + "sampling/sampling_logp_difference/mean": 0.015364770777523518, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1689.2, + "completions/max_terminated_length": 1689.2, + "completions/mean_length": 1061.3875, + "completions/mean_terminated_length": 1061.3875, + "completions/min_length": 670.6, + "completions/min_terminated_length": 670.6, + "entropy": 0.3800378322601318, + "epoch": 0.4935370152761457, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8299875855445862, + "learning_rate": 4.6474436636782165e-07, + "loss": 0.002, + "num_tokens": 58153094.0, + "reward": 0.5261979281902314, + "reward_std": 0.15056960582733153, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5261979281902314, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28166911005973816, + "sampling/importance_sampling_ratio/max": 1.9816663026809693, + "sampling/importance_sampling_ratio/mean": 1.0000874996185303, + "sampling/importance_sampling_ratio/min": 0.35656355023384095, + "sampling/sampling_logp_difference/max": 1.1156534433364869, + "sampling/sampling_logp_difference/mean": 0.015891117975115776, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1809.4, + "completions/max_terminated_length": 1809.4, + "completions/mean_length": 1166.70625, + "completions/mean_terminated_length": 1166.70625, + "completions/min_length": 792.6, + "completions/min_terminated_length": 792.6, + "entropy": 0.3703398644924164, + "epoch": 0.4994124559341951, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7483147978782654, + "learning_rate": 4.6413859946692514e-07, + "loss": 0.0159, + "num_tokens": 58863832.0, + "reward": 0.4547916650772095, + "reward_std": 0.09739691466093063, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.45479167699813844, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23734066933393477, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999337792396545, + "sampling/importance_sampling_ratio/min": 0.31920480728149414, + "sampling/sampling_logp_difference/max": 1.3080387592315674, + "sampling/sampling_logp_difference/mean": 0.015443827025592327, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1885.2, + "completions/max_terminated_length": 1781.4, + "completions/mean_length": 1171.234375, + "completions/mean_terminated_length": 1158.0830810546875, + "completions/min_length": 709.6, + "completions/min_terminated_length": 709.6, + "entropy": 0.36316835284233095, + "epoch": 0.5052878965922444, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.9235474467277527, + "learning_rate": 4.635328325660286e-07, + "loss": -0.0112, + "num_tokens": 59545799.0, + "reward": 0.5247916758060456, + "reward_std": 0.17066848278045654, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.52479168176651, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2839739441871643, + "sampling/importance_sampling_ratio/max": 1.9724763870239257, + "sampling/importance_sampling_ratio/mean": 0.999966812133789, + "sampling/importance_sampling_ratio/min": 0.2238582156598568, + "sampling/sampling_logp_difference/max": 1.789958667755127, + "sampling/sampling_logp_difference/mean": 0.01525730974972248, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1808.8, + "completions/max_terminated_length": 1808.8, + "completions/mean_length": 1119.996875, + "completions/mean_terminated_length": 1119.996875, + "completions/min_length": 755.0, + "completions/min_terminated_length": 755.0, + "entropy": 0.3681009292602539, + "epoch": 0.5111633372502937, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.8923326730728149, + "learning_rate": 4.62927065665132e-07, + "loss": 0.01, + "num_tokens": 60215142.0, + "reward": 0.521875, + "reward_std": 0.20172743797302245, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5218750059604644, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31086271703243257, + "sampling/importance_sampling_ratio/max": 1.9483107328414917, + "sampling/importance_sampling_ratio/mean": 0.9998976588249207, + "sampling/importance_sampling_ratio/min": 0.3355667650699615, + "sampling/sampling_logp_difference/max": 1.200546884536743, + "sampling/sampling_logp_difference/mean": 0.01538294218480587, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1521.8, + "completions/max_terminated_length": 1521.8, + "completions/mean_length": 1014.690625, + "completions/mean_terminated_length": 1014.690625, + "completions/min_length": 678.4, + "completions/min_terminated_length": 678.4, + "entropy": 0.374547415971756, + "epoch": 0.5170387779083431, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.7466691136360168, + "learning_rate": 4.623212987642355e-07, + "loss": -0.0055, + "num_tokens": 60849251.0, + "reward": 0.627343761920929, + "reward_std": 0.18858850598335267, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.627343761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30483335852622984, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000047564506531, + "sampling/importance_sampling_ratio/min": 0.30364882200956345, + "sampling/sampling_logp_difference/max": 1.9110549449920655, + "sampling/sampling_logp_difference/mean": 0.015604251064360142, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1848.4, + "completions/max_terminated_length": 1848.4, + "completions/mean_length": 1151.296875, + "completions/mean_terminated_length": 1151.296875, + "completions/min_length": 780.2, + "completions/min_terminated_length": 780.2, + "entropy": 0.3709349751472473, + "epoch": 0.5229142185663925, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8087165355682373, + "learning_rate": 4.6171553186333894e-07, + "loss": 0.0058, + "num_tokens": 61540146.0, + "reward": 0.5669270932674408, + "reward_std": 0.17213488817214967, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5669270932674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.270868119597435, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000919818878173, + "sampling/importance_sampling_ratio/min": 0.3365716278553009, + "sampling/sampling_logp_difference/max": 1.2326734781265258, + "sampling/sampling_logp_difference/mean": 0.015531861409544944, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1881.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1190.221875, + "completions/mean_terminated_length": 1190.221875, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 0.3719481110572815, + "epoch": 0.5287896592244419, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.7721225619316101, + "learning_rate": 4.611097649624424e-07, + "loss": 0.0083, + "num_tokens": 62243545.0, + "reward": 0.4819791793823242, + "reward_std": 0.2040602147579193, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4819791793823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27713783383369445, + "sampling/importance_sampling_ratio/max": 1.9724904775619507, + "sampling/importance_sampling_ratio/mean": 1.000051498413086, + "sampling/importance_sampling_ratio/min": 0.29547479525208475, + "sampling/sampling_logp_difference/max": 1.5890276193618775, + "sampling/sampling_logp_difference/mean": 0.015463878214359284, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1946.0, + "completions/max_terminated_length": 1804.2, + "completions/mean_length": 1184.165625, + "completions/mean_terminated_length": 1180.2292724609374, + "completions/min_length": 746.6, + "completions/min_terminated_length": 746.6, + "entropy": 0.36654204726219175, + "epoch": 0.5346650998824912, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.6808129549026489, + "learning_rate": 4.6050399806154586e-07, + "loss": 0.0152, + "num_tokens": 62939866.0, + "reward": 0.46161459684371947, + "reward_std": 0.21264611780643464, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.46161459684371947, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30921355485916135, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999315500259399, + "sampling/importance_sampling_ratio/min": 0.19656258896866347, + "sampling/sampling_logp_difference/max": 2.8585072994232177, + "sampling/sampling_logp_difference/mean": 0.015469144657254219, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1971.4, + "completions/max_terminated_length": 1971.4, + "completions/mean_length": 1279.0875, + "completions/mean_terminated_length": 1279.0875, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.3922215819358826, + "epoch": 0.5405405405405406, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7750838398933411, + "learning_rate": 4.5989823116064935e-07, + "loss": 0.0255, + "num_tokens": 63669686.0, + "reward": 0.5097916722297668, + "reward_std": 0.15973457396030427, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5097916722297668, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2736150071024895, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999813556671142, + "sampling/importance_sampling_ratio/min": 0.093977015838027, + "sampling/sampling_logp_difference/max": 2.793697214126587, + "sampling/sampling_logp_difference/mean": 0.015985237061977388, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1971.0, + "completions/max_terminated_length": 1971.0, + "completions/mean_length": 1246.646875, + "completions/mean_terminated_length": 1246.646875, + "completions/min_length": 780.6, + "completions/min_terminated_length": 780.6, + "entropy": 0.38005300164222716, + "epoch": 0.5464159811985899, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7596340775489807, + "learning_rate": 4.5929246425975284e-07, + "loss": 0.0204, + "num_tokens": 64388949.0, + "reward": 0.5148958444595337, + "reward_std": 0.17929587066173552, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5148958444595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2609217405319214, + "sampling/importance_sampling_ratio/max": 1.9646515369415283, + "sampling/importance_sampling_ratio/mean": 1.0001105666160583, + "sampling/importance_sampling_ratio/min": 0.3292378157377243, + "sampling/sampling_logp_difference/max": 1.3221782445907593, + "sampling/sampling_logp_difference/mean": 0.015639954805374147, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1978.4, + "completions/max_terminated_length": 1978.4, + "completions/mean_length": 1199.6125, + "completions/mean_terminated_length": 1199.6125, + "completions/min_length": 725.4, + "completions/min_terminated_length": 725.4, + "entropy": 0.3578630328178406, + "epoch": 0.5522914218566393, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.5981117486953735, + "learning_rate": 4.586866973588563e-07, + "loss": 0.0069, + "num_tokens": 65093913.0, + "reward": 0.6246875107288361, + "reward_std": 0.18131420761346817, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6246875107288361, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28334271609783174, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999997079372406, + "sampling/importance_sampling_ratio/min": 0.29355984926223755, + "sampling/sampling_logp_difference/max": 1.2378879070281983, + "sampling/sampling_logp_difference/mean": 0.015142908878624439, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1809.0, + "completions/max_terminated_length": 1809.0, + "completions/mean_length": 1175.615625, + "completions/mean_terminated_length": 1175.615625, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 0.3991717994213104, + "epoch": 0.5581668625146886, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.9285194873809814, + "learning_rate": 4.5808093045795977e-07, + "loss": 0.0292, + "num_tokens": 65823646.0, + "reward": 0.5126562476158142, + "reward_std": 0.21755965948104858, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5126562535762786, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29003084897994996, + "sampling/importance_sampling_ratio/max": 1.9938787698745728, + "sampling/importance_sampling_ratio/mean": 0.9999786853790283, + "sampling/importance_sampling_ratio/min": 0.318202418088913, + "sampling/sampling_logp_difference/max": 1.212429451942444, + "sampling/sampling_logp_difference/mean": 0.01659090295433998, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2030.6, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1207.65, + "completions/mean_terminated_length": 1204.127392578125, + "completions/min_length": 737.6, + "completions/min_terminated_length": 737.6, + "entropy": 0.3959426462650299, + "epoch": 0.564042303172738, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.5899296402931213, + "learning_rate": 4.574751635570632e-07, + "loss": 0.0117, + "num_tokens": 66513274.0, + "reward": 0.5737500190734863, + "reward_std": 0.19729947447776794, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5737500250339508, + "rewards/e2e_recall_precision_mixed_reward/std": 0.287484747171402, + "sampling/importance_sampling_ratio/max": 1.983570671081543, + "sampling/importance_sampling_ratio/mean": 0.9999786853790283, + "sampling/importance_sampling_ratio/min": 0.32921838760375977, + "sampling/sampling_logp_difference/max": 1.1545652389526366, + "sampling/sampling_logp_difference/mean": 0.016338953003287315, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2066.4, + "completions/max_terminated_length": 1972.4, + "completions/mean_length": 1254.078125, + "completions/mean_terminated_length": 1250.579541015625, + "completions/min_length": 652.0, + "completions/min_terminated_length": 652.0, + "entropy": 0.4095371961593628, + "epoch": 0.5699177438307873, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8716078996658325, + "learning_rate": 4.568693966561667e-07, + "loss": -0.0075, + "num_tokens": 67238959.0, + "reward": 0.5467187583446502, + "reward_std": 0.21395463943481446, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5467187702655792, + "rewards/e2e_recall_precision_mixed_reward/std": 0.306264591217041, + "sampling/importance_sampling_ratio/max": 1.9498302221298218, + "sampling/importance_sampling_ratio/mean": 1.0000301122665405, + "sampling/importance_sampling_ratio/min": 0.2736341401934624, + "sampling/sampling_logp_difference/max": 1.681512475013733, + "sampling/sampling_logp_difference/mean": 0.016801463067531587, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2096.6, + "completions/max_terminated_length": 2082.2, + "completions/mean_length": 1255.821875, + "completions/mean_terminated_length": 1252.06220703125, + "completions/min_length": 655.2, + "completions/min_terminated_length": 655.2, + "entropy": 0.40909390449523925, + "epoch": 0.5757931844888367, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6042861342430115, + "learning_rate": 4.562636297552702e-07, + "loss": 0.0145, + "num_tokens": 67953666.0, + "reward": 0.5453646063804627, + "reward_std": 0.17147594094276428, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5453646063804627, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25718378126621244, + "sampling/importance_sampling_ratio/max": 1.90937283039093, + "sampling/importance_sampling_ratio/mean": 1.000064730644226, + "sampling/importance_sampling_ratio/min": 0.350798898935318, + "sampling/sampling_logp_difference/max": 1.1801783084869384, + "sampling/sampling_logp_difference/mean": 0.016766057163476945, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2113.2, + "completions/max_terminated_length": 2105.4, + "completions/mean_length": 1227.88125, + "completions/mean_terminated_length": 1209.488232421875, + "completions/min_length": 691.8, + "completions/min_terminated_length": 691.8, + "entropy": 0.4179535210132599, + "epoch": 0.581668625146886, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.7605124115943909, + "learning_rate": 4.556578628543736e-07, + "loss": 0.0052, + "num_tokens": 68652744.0, + "reward": 0.565625011920929, + "reward_std": 0.1908968836069107, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.565625011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28528952896595, + "sampling/importance_sampling_ratio/max": 1.9585810661315919, + "sampling/importance_sampling_ratio/mean": 1.0000707626342773, + "sampling/importance_sampling_ratio/min": 0.354918098449707, + "sampling/sampling_logp_difference/max": 1.057307195663452, + "sampling/sampling_logp_difference/mean": 0.017101043835282326, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1780.4, + "completions/max_terminated_length": 1780.4, + "completions/mean_length": 1157.896875, + "completions/mean_terminated_length": 1157.896875, + "completions/min_length": 676.2, + "completions/min_terminated_length": 676.2, + "entropy": 0.4073349952697754, + "epoch": 0.5875440658049353, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.7811326384544373, + "learning_rate": 4.550520959534771e-07, + "loss": 0.0204, + "num_tokens": 69374967.0, + "reward": 0.5698958396911621, + "reward_std": 0.1886465221643448, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5698958396911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3525637865066528, + "sampling/importance_sampling_ratio/max": 1.851768970489502, + "sampling/importance_sampling_ratio/mean": 1.0000535249710083, + "sampling/importance_sampling_ratio/min": 0.42424078583717345, + "sampling/sampling_logp_difference/max": 1.0173060417175293, + "sampling/sampling_logp_difference/mean": 0.016810437291860582, + "step": 500 + }, + { + "epoch": 0.5875440658049353, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000625, + "eval_completions/max_length": 1826.52, + "eval_completions/max_terminated_length": 1794.64, + "eval_completions/mean_length": 1125.531875, + "eval_completions/mean_terminated_length": 1124.5743969726564, + "eval_completions/min_length": 649.08, + "eval_completions/min_terminated_length": 649.08, + "eval_entropy": 0.3993370485305786, + "eval_frac_reward_zero_std": 0.2, + "eval_loss": -0.0014770731795579195, + "eval_num_tokens": 69374967.0, + "eval_reward": 0.5213125109672546, + "eval_reward_std": 0.19255633950233458, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.5213125121593475, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3224320185184479, + "eval_runtime": 482.9379, + "eval_samples_per_second": 0.207, + "eval_sampling/importance_sampling_ratio/max": 1.9666599893569947, + "eval_sampling/importance_sampling_ratio/mean": 0.999978015422821, + "eval_sampling/importance_sampling_ratio/min": 0.32408329740166664, + "eval_sampling/sampling_logp_difference/max": 1.3359552669525145, + "eval_sampling/sampling_logp_difference/mean": 0.01666470244526863, + "eval_steps_per_second": 0.004, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2121.8, + "completions/max_terminated_length": 2065.2, + "completions/mean_length": 1188.903125, + "completions/mean_terminated_length": 1169.357568359375, + "completions/min_length": 606.6, + "completions/min_terminated_length": 606.6, + "entropy": 0.41026775240898133, + "epoch": 0.5934195064629847, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.9700555205345154, + "learning_rate": 4.5444632905258054e-07, + "loss": -0.0078, + "num_tokens": 70076500.0, + "reward": 0.47963541746139526, + "reward_std": 0.24116905331611632, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.4796354353427887, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3578904628753662, + "sampling/importance_sampling_ratio/max": 1.9784984588623047, + "sampling/importance_sampling_ratio/mean": 0.9999493837356568, + "sampling/importance_sampling_ratio/min": 0.2738966698758304, + "sampling/sampling_logp_difference/max": 1.9038654685020446, + "sampling/sampling_logp_difference/mean": 0.017127976939082145, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1973.2, + "completions/max_terminated_length": 1933.2, + "completions/mean_length": 1116.7625, + "completions/mean_terminated_length": 1105.833837890625, + "completions/min_length": 571.4, + "completions/min_terminated_length": 571.4, + "entropy": 0.3997494399547577, + "epoch": 0.599294947121034, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8417585492134094, + "learning_rate": 4.5384056215168403e-07, + "loss": -0.0091, + "num_tokens": 70751708.0, + "reward": 0.5510416805744172, + "reward_std": 0.24827665388584136, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5510416984558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.35132635235786436, + "sampling/importance_sampling_ratio/max": 1.9057066679000854, + "sampling/importance_sampling_ratio/mean": 1.00001118183136, + "sampling/importance_sampling_ratio/min": 0.34424024224281313, + "sampling/sampling_logp_difference/max": 1.115197741985321, + "sampling/sampling_logp_difference/mean": 0.016710634157061578, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2148.0, + "completions/max_terminated_length": 2118.6, + "completions/mean_length": 1253.075, + "completions/mean_terminated_length": 1249.4044189453125, + "completions/min_length": 749.8, + "completions/min_terminated_length": 749.8, + "entropy": 0.39709330797195436, + "epoch": 0.6051703877790834, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8181708455085754, + "learning_rate": 4.5323479525078747e-07, + "loss": -0.0217, + "num_tokens": 71472448.0, + "reward": 0.41421875953674314, + "reward_std": 0.18146341145038605, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.41421875953674314, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2794097185134888, + "sampling/importance_sampling_ratio/max": 1.890633797645569, + "sampling/importance_sampling_ratio/mean": 0.9998977184295654, + "sampling/importance_sampling_ratio/min": 0.361523362994194, + "sampling/sampling_logp_difference/max": 1.0541168928146363, + "sampling/sampling_logp_difference/mean": 0.01627396307885647, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1854.6, + "completions/max_terminated_length": 1854.6, + "completions/mean_length": 1165.190625, + "completions/mean_terminated_length": 1165.190625, + "completions/min_length": 655.6, + "completions/min_terminated_length": 655.6, + "entropy": 0.3984744131565094, + "epoch": 0.6110458284371327, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.802769660949707, + "learning_rate": 4.526290283498909e-07, + "loss": 0.0145, + "num_tokens": 72225469.0, + "reward": 0.48208335041999817, + "reward_std": 0.20398985743522643, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.48208335041999817, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31512056589126586, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999612689018249, + "sampling/importance_sampling_ratio/min": 0.31723029613494874, + "sampling/sampling_logp_difference/max": 1.2719123601913451, + "sampling/sampling_logp_difference/mean": 0.016700203344225884, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1694.2, + "completions/max_terminated_length": 1694.2, + "completions/mean_length": 1077.79375, + "completions/mean_terminated_length": 1077.79375, + "completions/min_length": 639.2, + "completions/min_terminated_length": 639.2, + "entropy": 0.390575510263443, + "epoch": 0.6169212690951822, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8257818222045898, + "learning_rate": 4.520232614489944e-07, + "loss": -0.0109, + "num_tokens": 72919387.0, + "reward": 0.6024479389190673, + "reward_std": 0.18384309411048888, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6024479389190673, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29759698510169985, + "sampling/importance_sampling_ratio/max": 1.951656460762024, + "sampling/importance_sampling_ratio/mean": 0.9999419927597046, + "sampling/importance_sampling_ratio/min": 0.4299475193023682, + "sampling/sampling_logp_difference/max": 0.8493948101997375, + "sampling/sampling_logp_difference/mean": 0.016626672819256783, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.4, + "completions/max_terminated_length": 1810.4, + "completions/mean_length": 1170.88125, + "completions/mean_terminated_length": 1170.88125, + "completions/min_length": 722.6, + "completions/min_terminated_length": 722.6, + "entropy": 0.37268310189247134, + "epoch": 0.6227967097532315, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.9070501923561096, + "learning_rate": 4.5141749454809783e-07, + "loss": 0.0075, + "num_tokens": 73614293.0, + "reward": 0.5833333432674408, + "reward_std": 0.19135434031486512, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5833333432674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2914866417646408, + "sampling/importance_sampling_ratio/max": 1.9210278034210204, + "sampling/importance_sampling_ratio/mean": 0.9998873472213745, + "sampling/importance_sampling_ratio/min": 0.3119745343923569, + "sampling/sampling_logp_difference/max": 1.2648154497146606, + "sampling/sampling_logp_difference/mean": 0.015689116902649403, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1951.2, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1136.20625, + "completions/mean_terminated_length": 1132.3883056640625, + "completions/min_length": 609.6, + "completions/min_terminated_length": 609.6, + "entropy": 0.37793651819229124, + "epoch": 0.6286721504112809, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.7828454971313477, + "learning_rate": 4.508117276472013e-07, + "loss": 0.0106, + "num_tokens": 74277411.0, + "reward": 0.49088543057441714, + "reward_std": 0.1814019948244095, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.49088541865348817, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3006050676107407, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999652862548828, + "sampling/importance_sampling_ratio/min": 0.42095847725868224, + "sampling/sampling_logp_difference/max": 1.4004887104034425, + "sampling/sampling_logp_difference/mean": 0.015673627704381944, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1927.2, + "completions/max_terminated_length": 1927.2, + "completions/mean_length": 1194.4, + "completions/mean_terminated_length": 1183.4655029296875, + "completions/min_length": 722.4, + "completions/min_terminated_length": 722.4, + "entropy": 0.3972707211971283, + "epoch": 0.6345475910693302, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.7965516448020935, + "learning_rate": 4.502059607463048e-07, + "loss": -0.0058, + "num_tokens": 74952935.0, + "reward": 0.49776042699813844, + "reward_std": 0.18791168928146362, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.49776042699813844, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32319254875183107, + "sampling/importance_sampling_ratio/max": 1.9314549446105957, + "sampling/importance_sampling_ratio/mean": 0.9998380184173584, + "sampling/importance_sampling_ratio/min": 0.3149589985609055, + "sampling/sampling_logp_difference/max": 1.28137925863266, + "sampling/sampling_logp_difference/mean": 0.016445842757821082, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2184.8, + "completions/max_terminated_length": 2125.4, + "completions/mean_length": 1200.940625, + "completions/mean_terminated_length": 1196.8129150390625, + "completions/min_length": 651.6, + "completions/min_terminated_length": 651.6, + "entropy": 0.35937982201576235, + "epoch": 0.6404230317273796, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6078236699104309, + "learning_rate": 4.4960019384540825e-07, + "loss": -0.0231, + "num_tokens": 75673568.0, + "reward": 0.5217708289623261, + "reward_std": 0.1847947582602501, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.521770840883255, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3309717208147049, + "sampling/importance_sampling_ratio/max": 1.9890816926956176, + "sampling/importance_sampling_ratio/mean": 0.999953544139862, + "sampling/importance_sampling_ratio/min": 0.2543433949351311, + "sampling/sampling_logp_difference/max": 1.4823894262313844, + "sampling/sampling_logp_difference/mean": 0.015391046553850174, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2021.8, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 1226.0375, + "completions/mean_terminated_length": 1218.3116943359375, + "completions/min_length": 760.8, + "completions/min_terminated_length": 760.8, + "entropy": 0.3644507646560669, + "epoch": 0.6462984723854289, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.7988547086715698, + "learning_rate": 4.4899442694451174e-07, + "loss": -0.01, + "num_tokens": 76352900.0, + "reward": 0.5784375071525574, + "reward_std": 0.19074564576148986, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5784375071525574, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29962594509124757, + "sampling/importance_sampling_ratio/max": 1.9379316568374634, + "sampling/importance_sampling_ratio/mean": 1.0000091791152954, + "sampling/importance_sampling_ratio/min": 0.27861389741301534, + "sampling/sampling_logp_difference/max": 1.5992833375930786, + "sampling/sampling_logp_difference/mean": 0.015281017497181892, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1948.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 1158.328125, + "completions/mean_terminated_length": 1158.328125, + "completions/min_length": 670.8, + "completions/min_terminated_length": 670.8, + "entropy": 0.3680810987949371, + "epoch": 0.6521739130434783, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.7596303820610046, + "learning_rate": 4.4838866004361517e-07, + "loss": 0.0209, + "num_tokens": 77036733.0, + "reward": 0.6009375095367432, + "reward_std": 0.20273202657699585, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6009375214576721, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2976983994245529, + "sampling/importance_sampling_ratio/max": 1.8425412893295288, + "sampling/importance_sampling_ratio/mean": 0.9999056339263916, + "sampling/importance_sampling_ratio/min": 0.365853750705719, + "sampling/sampling_logp_difference/max": 1.008244252204895, + "sampling/sampling_logp_difference/mean": 0.015644372254610062, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1914.8, + "completions/max_terminated_length": 1914.8, + "completions/mean_length": 1163.9125, + "completions/mean_terminated_length": 1163.9125, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.34467315673828125, + "epoch": 0.6580493537015276, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.5723123550415039, + "learning_rate": 4.4778289314271866e-07, + "loss": 0.0259, + "num_tokens": 77729553.0, + "reward": 0.5305729269981384, + "reward_std": 0.15761651396751403, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5305729269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3086104154586792, + "sampling/importance_sampling_ratio/max": 1.9935364246368408, + "sampling/importance_sampling_ratio/mean": 0.9999402403831482, + "sampling/importance_sampling_ratio/min": 0.3442982375621796, + "sampling/sampling_logp_difference/max": 1.0778313636779786, + "sampling/sampling_logp_difference/mean": 0.014830858074128627, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1942.2, + "completions/max_terminated_length": 1942.2, + "completions/mean_length": 1147.51875, + "completions/mean_terminated_length": 1147.51875, + "completions/min_length": 760.6, + "completions/min_terminated_length": 760.6, + "entropy": 0.3442341387271881, + "epoch": 0.663924794359577, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6711921095848083, + "learning_rate": 4.4717712624182215e-07, + "loss": 0.0053, + "num_tokens": 78408247.0, + "reward": 0.5476041793823242, + "reward_std": 0.14775162786245347, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5476041913032532, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29104921221733093, + "sampling/importance_sampling_ratio/max": 1.9945608377456665, + "sampling/importance_sampling_ratio/mean": 1.0000170111656188, + "sampling/importance_sampling_ratio/min": 0.41033602952957154, + "sampling/sampling_logp_difference/max": 0.9867128372192383, + "sampling/sampling_logp_difference/mean": 0.014713744446635247, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1990.6, + "completions/max_terminated_length": 1891.4, + "completions/mean_length": 1251.853125, + "completions/mean_terminated_length": 1247.9536865234375, + "completions/min_length": 734.2, + "completions/min_terminated_length": 734.2, + "entropy": 0.36719573736190797, + "epoch": 0.6698002350176263, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8351427912712097, + "learning_rate": 4.465713593409256e-07, + "loss": 0.0039, + "num_tokens": 79119076.0, + "reward": 0.5445312678813934, + "reward_std": 0.21431024968624116, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5445312678813934, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29993789792060854, + "sampling/importance_sampling_ratio/max": 1.9819918394088745, + "sampling/importance_sampling_ratio/mean": 1.0000818252563477, + "sampling/importance_sampling_ratio/min": 0.353739058971405, + "sampling/sampling_logp_difference/max": 1.094305396080017, + "sampling/sampling_logp_difference/mean": 0.015493137948215007, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1775.6, + "completions/max_terminated_length": 1775.6, + "completions/mean_length": 1123.984375, + "completions/mean_terminated_length": 1123.984375, + "completions/min_length": 751.6, + "completions/min_terminated_length": 751.6, + "entropy": 0.3667173147201538, + "epoch": 0.6756756756756757, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6817618608474731, + "learning_rate": 4.459655924400291e-07, + "loss": 0.0128, + "num_tokens": 79775567.0, + "reward": 0.6600000143051148, + "reward_std": 0.1561010330915451, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6600000143051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27142361700534823, + "sampling/importance_sampling_ratio/max": 1.961619234085083, + "sampling/importance_sampling_ratio/mean": 1.0000163555145263, + "sampling/importance_sampling_ratio/min": 0.36026729345321656, + "sampling/sampling_logp_difference/max": 1.0491438388824463, + "sampling/sampling_logp_difference/mean": 0.01550084501504898, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1770.6, + "completions/max_terminated_length": 1770.6, + "completions/mean_length": 1138.36875, + "completions/mean_terminated_length": 1138.36875, + "completions/min_length": 691.6, + "completions/min_terminated_length": 691.6, + "entropy": 0.3702571511268616, + "epoch": 0.681551116333725, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.885202944278717, + "learning_rate": 4.453598255391325e-07, + "loss": 0.008, + "num_tokens": 80422453.0, + "reward": 0.5227083444595337, + "reward_std": 0.1915093831717968, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5227083444595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2948000729084015, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999974846839905, + "sampling/importance_sampling_ratio/min": 0.41094982624053955, + "sampling/sampling_logp_difference/max": 1.0436588287353517, + "sampling/sampling_logp_difference/mean": 0.01540203858166933, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1851.2, + "completions/max_terminated_length": 1843.6, + "completions/mean_length": 1159.803125, + "completions/mean_terminated_length": 1152.411279296875, + "completions/min_length": 670.4, + "completions/min_terminated_length": 670.4, + "entropy": 0.3673887014389038, + "epoch": 0.6874265569917744, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.6437573432922363, + "learning_rate": 4.44754058638236e-07, + "loss": -0.006, + "num_tokens": 81093870.0, + "reward": 0.6208333611488343, + "reward_std": 0.15290768593549728, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6208333611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2849018633365631, + "sampling/importance_sampling_ratio/max": 1.8680822134017945, + "sampling/importance_sampling_ratio/mean": 1.0000381112098693, + "sampling/importance_sampling_ratio/min": 0.378055739402771, + "sampling/sampling_logp_difference/max": 1.003414511680603, + "sampling/sampling_logp_difference/mean": 0.015409078076481819, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1771.8, + "completions/max_terminated_length": 1771.8, + "completions/mean_length": 1149.809375, + "completions/mean_terminated_length": 1149.809375, + "completions/min_length": 723.2, + "completions/min_terminated_length": 723.2, + "entropy": 0.36604792475700376, + "epoch": 0.6933019976498237, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.9135239720344543, + "learning_rate": 4.441482917373395e-07, + "loss": 0.0199, + "num_tokens": 81789857.0, + "reward": 0.6539062619209289, + "reward_std": 0.19954511225223542, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6539062619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24059314727783204, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000005555152893, + "sampling/importance_sampling_ratio/min": 0.4236325442790985, + "sampling/sampling_logp_difference/max": 1.108893585205078, + "sampling/sampling_logp_difference/mean": 0.015483367443084716, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1860.4, + "completions/max_terminated_length": 1860.4, + "completions/mean_length": 1070.809375, + "completions/mean_terminated_length": 1070.809375, + "completions/min_length": 663.2, + "completions/min_terminated_length": 663.2, + "entropy": 0.34452658891677856, + "epoch": 0.699177438307873, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.9200600981712341, + "learning_rate": 4.4354252483644293e-07, + "loss": 0.0155, + "num_tokens": 82480708.0, + "reward": 0.5846354305744171, + "reward_std": 0.17398979663848876, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5846354305744171, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3412102937698364, + "sampling/importance_sampling_ratio/max": 1.9801399946212768, + "sampling/importance_sampling_ratio/mean": 0.9999470949172974, + "sampling/importance_sampling_ratio/min": 0.43194087147712706, + "sampling/sampling_logp_difference/max": 1.0336421251296997, + "sampling/sampling_logp_difference/mean": 0.014913180842995644, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2101.6, + "completions/max_terminated_length": 2073.4, + "completions/mean_length": 1216.24375, + "completions/mean_terminated_length": 1208.71279296875, + "completions/min_length": 721.6, + "completions/min_terminated_length": 721.6, + "entropy": 0.3706329584121704, + "epoch": 0.7050528789659224, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7335398197174072, + "learning_rate": 4.4293675793554636e-07, + "loss": -0.0294, + "num_tokens": 83165354.0, + "reward": 0.600885421037674, + "reward_std": 0.15805183053016664, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.600885421037674, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2339630365371704, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000630497932435, + "sampling/importance_sampling_ratio/min": 0.29682928044348955, + "sampling/sampling_logp_difference/max": 1.9509897232055664, + "sampling/sampling_logp_difference/mean": 0.015584040805697441, + "step": 600 + }, + { + "epoch": 0.7050528789659224, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.00125, + "eval_completions/max_length": 1885.44, + "eval_completions/max_terminated_length": 1822.08, + "eval_completions/mean_length": 1111.4675, + "eval_completions/mean_terminated_length": 1109.69341796875, + "eval_completions/min_length": 681.28, + "eval_completions/min_terminated_length": 681.28, + "eval_entropy": 0.35809629917144775, + "eval_frac_reward_zero_std": 0.2, + "eval_loss": 0.0031391119118779898, + "eval_num_tokens": 83165354.0, + "eval_reward": 0.5702604281902314, + "eval_reward_std": 0.18180499017238616, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.5702604305744171, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31943190515041353, + "eval_runtime": 484.4398, + "eval_samples_per_second": 0.206, + "eval_sampling/importance_sampling_ratio/max": 1.9421213483810424, + "eval_sampling/importance_sampling_ratio/mean": 1.000009124279022, + "eval_sampling/importance_sampling_ratio/min": 0.32641365081071855, + "eval_sampling/sampling_logp_difference/max": 1.2518345785140992, + "eval_sampling/sampling_logp_difference/mean": 0.015406711027026176, + "eval_steps_per_second": 0.004, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1916.6, + "completions/max_terminated_length": 1916.6, + "completions/mean_length": 1208.140625, + "completions/mean_terminated_length": 1208.140625, + "completions/min_length": 767.4, + "completions/min_terminated_length": 767.4, + "entropy": 0.36099555492401125, + "epoch": 0.7109283196239718, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.9186404943466187, + "learning_rate": 4.423309910346498e-07, + "loss": 0.0137, + "num_tokens": 83870567.0, + "reward": 0.6028645992279053, + "reward_std": 0.18433087766170503, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6028645992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27474440932273864, + "sampling/importance_sampling_ratio/max": 1.9217813730239868, + "sampling/importance_sampling_ratio/mean": 1.0000322818756104, + "sampling/importance_sampling_ratio/min": 0.33838503062725067, + "sampling/sampling_logp_difference/max": 1.2144441843032836, + "sampling/sampling_logp_difference/mean": 0.015485203452408313, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1942.8, + "completions/max_terminated_length": 1843.8, + "completions/mean_length": 1060.925, + "completions/mean_terminated_length": 1056.8969482421876, + "completions/min_length": 572.0, + "completions/min_terminated_length": 572.0, + "entropy": 0.3543337404727936, + "epoch": 0.7168037602820212, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8076752424240112, + "learning_rate": 4.417252241337533e-07, + "loss": -0.0074, + "num_tokens": 84533131.0, + "reward": 0.6001562714576721, + "reward_std": 0.1616973862051964, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6001562714576721, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2991356194019318, + "sampling/importance_sampling_ratio/max": 1.8717167854309082, + "sampling/importance_sampling_ratio/mean": 0.9999955654144287, + "sampling/importance_sampling_ratio/min": 0.38679862320423125, + "sampling/sampling_logp_difference/max": 1.0169011354446411, + "sampling/sampling_logp_difference/mean": 0.015325301699340344, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1790.4, + "completions/max_terminated_length": 1790.4, + "completions/mean_length": 1141.934375, + "completions/mean_terminated_length": 1141.934375, + "completions/min_length": 691.8, + "completions/min_terminated_length": 691.8, + "entropy": 0.36986821293830874, + "epoch": 0.7226792009400705, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7624120712280273, + "learning_rate": 4.411194572328568e-07, + "loss": -0.0142, + "num_tokens": 85203078.0, + "reward": 0.6421875119209289, + "reward_std": 0.15983470678329467, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6421875119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24894966185092926, + "sampling/importance_sampling_ratio/max": 1.9372856378555299, + "sampling/importance_sampling_ratio/mean": 0.9998981475830078, + "sampling/importance_sampling_ratio/min": 0.3545094013214111, + "sampling/sampling_logp_difference/max": 1.174563217163086, + "sampling/sampling_logp_difference/mean": 0.015732382237911225, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1923.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1150.5875, + "completions/mean_terminated_length": 1150.5875, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.3715690791606903, + "epoch": 0.7285546415981199, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.5657132863998413, + "learning_rate": 4.405136903319602e-07, + "loss": 0.0199, + "num_tokens": 85870882.0, + "reward": 0.5806770920753479, + "reward_std": 0.17638582587242127, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5806770920753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3543856680393219, + "sampling/importance_sampling_ratio/max": 1.9489038705825805, + "sampling/importance_sampling_ratio/mean": 1.0000184893608093, + "sampling/importance_sampling_ratio/min": 0.34882347881793974, + "sampling/sampling_logp_difference/max": 1.131108021736145, + "sampling/sampling_logp_difference/mean": 0.015539542399346828, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2070.0, + "completions/max_terminated_length": 1961.2, + "completions/mean_length": 1203.53125, + "completions/mean_terminated_length": 1199.6821533203124, + "completions/min_length": 765.4, + "completions/min_terminated_length": 765.4, + "entropy": 0.364533132314682, + "epoch": 0.7344300822561692, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6601881980895996, + "learning_rate": 4.399079234310637e-07, + "loss": 0.0201, + "num_tokens": 86598024.0, + "reward": 0.5711979389190673, + "reward_std": 0.18835237622261047, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5711979389190673, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3303461790084839, + "sampling/importance_sampling_ratio/max": 1.9624412298202514, + "sampling/importance_sampling_ratio/mean": 0.9998656749725342, + "sampling/importance_sampling_ratio/min": 0.37342591881752013, + "sampling/sampling_logp_difference/max": 1.0346963167190553, + "sampling/sampling_logp_difference/mean": 0.015363234095275402, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1870.8, + "completions/max_terminated_length": 1870.8, + "completions/mean_length": 1130.415625, + "completions/mean_terminated_length": 1130.415625, + "completions/min_length": 705.6, + "completions/min_terminated_length": 705.6, + "entropy": 0.3572323977947235, + "epoch": 0.7403055229142186, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.4808492064476013, + "learning_rate": 4.3930215653016714e-07, + "loss": 0.0067, + "num_tokens": 87275725.0, + "reward": 0.5479687631130219, + "reward_std": 0.1671212889254093, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5479687631130219, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2871303677558899, + "sampling/importance_sampling_ratio/max": 1.9045668125152588, + "sampling/importance_sampling_ratio/mean": 1.0000359773635865, + "sampling/importance_sampling_ratio/min": 0.390330970287323, + "sampling/sampling_logp_difference/max": 0.9632290363311767, + "sampling/sampling_logp_difference/mean": 0.015123646520078183, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1733.6, + "completions/max_terminated_length": 1733.6, + "completions/mean_length": 1164.278125, + "completions/mean_terminated_length": 1164.278125, + "completions/min_length": 786.2, + "completions/min_terminated_length": 786.2, + "entropy": 0.38033042550086976, + "epoch": 0.7461809635722679, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8040631413459778, + "learning_rate": 4.3869638962927063e-07, + "loss": 0.0232, + "num_tokens": 87977846.0, + "reward": 0.6135416746139526, + "reward_std": 0.17617344856262207, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6135416805744172, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30681161880493163, + "sampling/importance_sampling_ratio/max": 1.8931318521499634, + "sampling/importance_sampling_ratio/mean": 0.9999570846557617, + "sampling/importance_sampling_ratio/min": 0.42780248522758485, + "sampling/sampling_logp_difference/max": 0.8543825387954712, + "sampling/sampling_logp_difference/mean": 0.0160001702606678, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 2039.0, + "completions/max_terminated_length": 1983.8, + "completions/mean_length": 1142.871875, + "completions/mean_terminated_length": 1123.12822265625, + "completions/min_length": 655.6, + "completions/min_terminated_length": 655.6, + "entropy": 0.3652238368988037, + "epoch": 0.7520564042303173, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8171776533126831, + "learning_rate": 4.380906227283741e-07, + "loss": 0.0168, + "num_tokens": 88659353.0, + "reward": 0.645781260728836, + "reward_std": 0.16527300626039504, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6457812786102295, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32556850016117095, + "sampling/importance_sampling_ratio/max": 1.9582969903945924, + "sampling/importance_sampling_ratio/mean": 0.999891209602356, + "sampling/importance_sampling_ratio/min": 0.3786491721868515, + "sampling/sampling_logp_difference/max": 1.0249874234199523, + "sampling/sampling_logp_difference/mean": 0.015227846615016461, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1975.0, + "completions/max_terminated_length": 1815.6, + "completions/mean_length": 1169.203125, + "completions/mean_terminated_length": 1165.0383544921874, + "completions/min_length": 741.8, + "completions/min_terminated_length": 741.8, + "entropy": 0.3585414707660675, + "epoch": 0.7579318448883666, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8918953537940979, + "learning_rate": 4.3748485582747756e-07, + "loss": 0.0061, + "num_tokens": 89353142.0, + "reward": 0.5856250166893006, + "reward_std": 0.22155131995677949, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5856250107288361, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2939130663871765, + "sampling/importance_sampling_ratio/max": 1.9639912843704224, + "sampling/importance_sampling_ratio/mean": 0.9999972224235535, + "sampling/importance_sampling_ratio/min": 0.37892901003360746, + "sampling/sampling_logp_difference/max": 0.9988775253295898, + "sampling/sampling_logp_difference/mean": 0.014916538074612618, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2024.8, + "completions/max_terminated_length": 2024.8, + "completions/mean_length": 1198.5, + "completions/mean_terminated_length": 1198.5, + "completions/min_length": 787.4, + "completions/min_terminated_length": 787.4, + "entropy": 0.3784613788127899, + "epoch": 0.763807285546416, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.806064248085022, + "learning_rate": 4.3687908892658105e-07, + "loss": 0.001, + "num_tokens": 90064694.0, + "reward": 0.6430729389190674, + "reward_std": 0.19357036650180817, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6430729389190674, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31154467463493346, + "sampling/importance_sampling_ratio/max": 1.9199378967285157, + "sampling/importance_sampling_ratio/mean": 0.9998810887336731, + "sampling/importance_sampling_ratio/min": 0.29518369734287264, + "sampling/sampling_logp_difference/max": 1.2997811555862426, + "sampling/sampling_logp_difference/mean": 0.0156770009547472, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1619.2, + "completions/max_terminated_length": 1619.2, + "completions/mean_length": 1067.253125, + "completions/mean_terminated_length": 1067.253125, + "completions/min_length": 648.0, + "completions/min_terminated_length": 648.0, + "entropy": 0.3725395202636719, + "epoch": 0.7696827262044653, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.9183342456817627, + "learning_rate": 4.3627332202568454e-07, + "loss": 0.0011, + "num_tokens": 90727559.0, + "reward": 0.5644791722297668, + "reward_std": 0.1635679453611374, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5644791722297668, + "rewards/e2e_recall_precision_mixed_reward/std": 0.37316291928291323, + "sampling/importance_sampling_ratio/max": 1.9169376850128175, + "sampling/importance_sampling_ratio/mean": 0.9999460935592651, + "sampling/importance_sampling_ratio/min": 0.3552225947380066, + "sampling/sampling_logp_difference/max": 1.0906750798225402, + "sampling/sampling_logp_difference/mean": 0.015604752302169799, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1765.0, + "completions/max_terminated_length": 1765.0, + "completions/mean_length": 1024.040625, + "completions/mean_terminated_length": 1024.040625, + "completions/min_length": 587.4, + "completions/min_terminated_length": 587.4, + "entropy": 0.3731545150279999, + "epoch": 0.7755581668625147, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.8010883927345276, + "learning_rate": 4.3566755512478797e-07, + "loss": -0.0083, + "num_tokens": 91353300.0, + "reward": 0.6674479246139526, + "reward_std": 0.1368040680885315, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6674479246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3341027498245239, + "sampling/importance_sampling_ratio/max": 1.926101303100586, + "sampling/importance_sampling_ratio/mean": 0.9999330401420593, + "sampling/importance_sampling_ratio/min": 0.28611020296812056, + "sampling/sampling_logp_difference/max": 1.6566476583480836, + "sampling/sampling_logp_difference/mean": 0.01547999307513237, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1819.4, + "completions/max_terminated_length": 1703.6, + "completions/mean_length": 1121.246875, + "completions/mean_terminated_length": 1117.4317626953125, + "completions/min_length": 722.4, + "completions/min_terminated_length": 722.4, + "entropy": 0.3596222817897797, + "epoch": 0.781433607520564, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.37091967463493347, + "learning_rate": 4.3506178822389146e-07, + "loss": 0.0105, + "num_tokens": 92051055.0, + "reward": 0.5822916746139526, + "reward_std": 0.14248881340026856, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5822916805744172, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28063163459300994, + "sampling/importance_sampling_ratio/max": 1.9675161600112916, + "sampling/importance_sampling_ratio/mean": 1.000024175643921, + "sampling/importance_sampling_ratio/min": 0.34341188669204714, + "sampling/sampling_logp_difference/max": 1.362108290195465, + "sampling/sampling_logp_difference/mean": 0.015318811498582363, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1901.6, + "completions/max_terminated_length": 1901.6, + "completions/mean_length": 1049.240625, + "completions/mean_terminated_length": 1049.240625, + "completions/min_length": 705.2, + "completions/min_terminated_length": 705.2, + "entropy": 0.3371972322463989, + "epoch": 0.7873090481786134, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.9466339945793152, + "learning_rate": 4.344560213229949e-07, + "loss": 0.0087, + "num_tokens": 92731788.0, + "reward": 0.621875, + "reward_std": 0.17843145430088042, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.621875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3395528972148895, + "sampling/importance_sampling_ratio/max": 1.868542456626892, + "sampling/importance_sampling_ratio/mean": 1.0001242876052856, + "sampling/importance_sampling_ratio/min": 0.31800718009471896, + "sampling/sampling_logp_difference/max": 1.1785770416259767, + "sampling/sampling_logp_difference/mean": 0.014529785700142383, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1871.6, + "completions/max_terminated_length": 1871.6, + "completions/mean_length": 1122.5875, + "completions/mean_terminated_length": 1122.5875, + "completions/min_length": 654.0, + "completions/min_terminated_length": 654.0, + "entropy": 0.3579367399215698, + "epoch": 0.7931844888366627, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8181319832801819, + "learning_rate": 4.338502544220984e-07, + "loss": 0.0265, + "num_tokens": 93410776.0, + "reward": 0.5313541889190674, + "reward_std": 0.16972679942846297, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5313541829586029, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28352001309394836, + "sampling/importance_sampling_ratio/max": 1.973141074180603, + "sampling/importance_sampling_ratio/mean": 1.0001737236976624, + "sampling/importance_sampling_ratio/min": 0.4472075402736664, + "sampling/sampling_logp_difference/max": 0.917256760597229, + "sampling/sampling_logp_difference/mean": 0.015095295198261739, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1637.2, + "completions/max_terminated_length": 1637.2, + "completions/mean_length": 1013.86875, + "completions/mean_terminated_length": 1013.86875, + "completions/min_length": 630.2, + "completions/min_terminated_length": 630.2, + "entropy": 0.34675438404083253, + "epoch": 0.799059929494712, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.9746776819229126, + "learning_rate": 4.3324448752120177e-07, + "loss": -0.0082, + "num_tokens": 94042670.0, + "reward": 0.6797916769981385, + "reward_std": 0.14540175199508668, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6797916769981385, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31566589772701265, + "sampling/importance_sampling_ratio/max": 1.8168800592422485, + "sampling/importance_sampling_ratio/mean": 0.9998704791069031, + "sampling/importance_sampling_ratio/min": 0.343018639087677, + "sampling/sampling_logp_difference/max": 1.1331130743026734, + "sampling/sampling_logp_difference/mean": 0.014893803745508194, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1943.6, + "completions/max_terminated_length": 1943.6, + "completions/mean_length": 1118.8375, + "completions/mean_terminated_length": 1118.8375, + "completions/min_length": 641.8, + "completions/min_terminated_length": 641.8, + "entropy": 0.3540935754776001, + "epoch": 0.8049353701527615, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.9660540223121643, + "learning_rate": 4.3263872062030526e-07, + "loss": -0.0026, + "num_tokens": 94725594.0, + "reward": 0.6530729293823242, + "reward_std": 0.10995356068015098, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6530729293823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2734446346759796, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000356674194335, + "sampling/importance_sampling_ratio/min": 0.2624353013234213, + "sampling/sampling_logp_difference/max": 2.294406795501709, + "sampling/sampling_logp_difference/mean": 0.015182534791529178, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1589.6, + "completions/max_terminated_length": 1589.6, + "completions/mean_length": 1034.95, + "completions/mean_terminated_length": 1034.95, + "completions/min_length": 558.2, + "completions/min_terminated_length": 558.2, + "entropy": 0.3575005650520325, + "epoch": 0.8108108108108109, + "frac_reward_zero_std": 0.15, + "grad_norm": 1.0704731941223145, + "learning_rate": 4.3203295371940875e-07, + "loss": 0.0023, + "num_tokens": 95370026.0, + "reward": 0.6488541722297668, + "reward_std": 0.19183206856250762, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6488541722297668, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3406189620494843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000473976135253, + "sampling/importance_sampling_ratio/min": 0.361629045009613, + "sampling/sampling_logp_difference/max": 1.0578797578811645, + "sampling/sampling_logp_difference/mean": 0.015206655859947205, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1792.8, + "completions/max_terminated_length": 1792.8, + "completions/mean_length": 1117.090625, + "completions/mean_terminated_length": 1117.090625, + "completions/min_length": 713.0, + "completions/min_terminated_length": 713.0, + "entropy": 0.3441681444644928, + "epoch": 0.8166862514688602, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8466465473175049, + "learning_rate": 4.314271868185122e-07, + "loss": 0.0066, + "num_tokens": 96058743.0, + "reward": 0.6003645956516266, + "reward_std": 0.21395085752010345, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6003645956516266, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3233032822608948, + "sampling/importance_sampling_ratio/max": 1.8979068994522095, + "sampling/importance_sampling_ratio/mean": 1.0000794172286986, + "sampling/importance_sampling_ratio/min": 0.38034394979476926, + "sampling/sampling_logp_difference/max": 1.0374782085418701, + "sampling/sampling_logp_difference/mean": 0.01497993227094412, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1584.2, + "completions/max_terminated_length": 1584.2, + "completions/mean_length": 1046.925, + "completions/mean_terminated_length": 1046.925, + "completions/min_length": 636.6, + "completions/min_terminated_length": 636.6, + "entropy": 0.3259464383125305, + "epoch": 0.8225616921269095, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.7266973853111267, + "learning_rate": 4.308214199176157e-07, + "loss": 0.0302, + "num_tokens": 96709727.0, + "reward": 0.6774479508399963, + "reward_std": 0.1927065670490265, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6774479389190674, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2997411698102951, + "sampling/importance_sampling_ratio/max": 1.8562868118286133, + "sampling/importance_sampling_ratio/mean": 0.9998578786849975, + "sampling/importance_sampling_ratio/min": 0.28235826790332796, + "sampling/sampling_logp_difference/max": 1.3985157489776612, + "sampling/sampling_logp_difference/mean": 0.014192759618163108, + "step": 700 + }, + { + "epoch": 0.8225616921269095, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1574.08, + "eval_completions/max_terminated_length": 1574.08, + "eval_completions/mean_length": 1028.64625, + "eval_completions/mean_terminated_length": 1028.64625, + "eval_completions/min_length": 622.04, + "eval_completions/min_terminated_length": 622.04, + "eval_entropy": 0.33704278111457825, + "eval_frac_reward_zero_std": 0.25, + "eval_loss": 0.0007254349184222519, + "eval_num_tokens": 96709727.0, + "eval_reward": 0.6012395948171616, + "eval_reward_std": 0.1713283321261406, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6012395972013473, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3278186982870102, + "eval_runtime": 422.8508, + "eval_samples_per_second": 0.236, + "eval_sampling/importance_sampling_ratio/max": 1.9145189142227172, + "eval_sampling/importance_sampling_ratio/mean": 1.000025644302368, + "eval_sampling/importance_sampling_ratio/min": 0.3480164834856987, + "eval_sampling/sampling_logp_difference/max": 1.2185939931869507, + "eval_sampling/sampling_logp_difference/mean": 0.014661024622619152, + "eval_steps_per_second": 0.005, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1610.8, + "completions/max_terminated_length": 1610.8, + "completions/mean_length": 1018.06875, + "completions/mean_terminated_length": 1018.06875, + "completions/min_length": 606.0, + "completions/min_terminated_length": 606.0, + "entropy": 0.3368810176849365, + "epoch": 0.8284371327849589, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8269535899162292, + "learning_rate": 4.3021565301671916e-07, + "loss": -0.0007, + "num_tokens": 97340021.0, + "reward": 0.6871875166893006, + "reward_std": 0.15638408213853836, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6871875166893006, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27981995344161986, + "sampling/importance_sampling_ratio/max": 1.813429832458496, + "sampling/importance_sampling_ratio/mean": 0.9999094605445862, + "sampling/importance_sampling_ratio/min": 0.4124268352985382, + "sampling/sampling_logp_difference/max": 0.9072500944137574, + "sampling/sampling_logp_difference/mean": 0.014728988707065582, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1579.2, + "completions/max_terminated_length": 1500.6, + "completions/mean_length": 974.878125, + "completions/mean_terminated_length": 965.8274169921875, + "completions/min_length": 631.2, + "completions/min_terminated_length": 631.2, + "entropy": 0.32004879117012025, + "epoch": 0.8343125734430082, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7339820265769958, + "learning_rate": 4.296098861158226e-07, + "loss": -0.0021, + "num_tokens": 97975014.0, + "reward": 0.7150000095367431, + "reward_std": 0.12368128597736358, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7150000214576722, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2955649644136429, + "sampling/importance_sampling_ratio/max": 1.9696100234985352, + "sampling/importance_sampling_ratio/mean": 0.9999983787536622, + "sampling/importance_sampling_ratio/min": 0.3940297782421112, + "sampling/sampling_logp_difference/max": 0.9819095611572266, + "sampling/sampling_logp_difference/mean": 0.014322330988943578, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1625.6, + "completions/max_terminated_length": 1625.6, + "completions/mean_length": 1042.034375, + "completions/mean_terminated_length": 1042.034375, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.3161555349826813, + "epoch": 0.8401880141010576, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8281562924385071, + "learning_rate": 4.290041192149261e-07, + "loss": 0.003, + "num_tokens": 98647761.0, + "reward": 0.6562500119209289, + "reward_std": 0.1516960322856903, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6562500119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31697266101837157, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999948978424072, + "sampling/importance_sampling_ratio/min": 0.34369261264801027, + "sampling/sampling_logp_difference/max": 1.1635895490646362, + "sampling/sampling_logp_difference/mean": 0.014115219749510289, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1750.6, + "completions/max_terminated_length": 1666.6, + "completions/mean_length": 1037.1875, + "completions/mean_terminated_length": 1027.6480834960937, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "entropy": 0.3309583842754364, + "epoch": 0.8460634547591069, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.9976007342338562, + "learning_rate": 4.283983523140295e-07, + "loss": -0.0162, + "num_tokens": 99314501.0, + "reward": 0.6807812452316284, + "reward_std": 0.1338688015937805, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6807812452316284, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3191369116306305, + "sampling/importance_sampling_ratio/max": 1.9326151609420776, + "sampling/importance_sampling_ratio/mean": 0.9999658942222596, + "sampling/importance_sampling_ratio/min": 0.4041714251041412, + "sampling/sampling_logp_difference/max": 1.0760185718536377, + "sampling/sampling_logp_difference/mean": 0.01456475555896759, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1600.4, + "completions/max_terminated_length": 1600.4, + "completions/mean_length": 1046.459375, + "completions/mean_terminated_length": 1046.459375, + "completions/min_length": 649.2, + "completions/min_terminated_length": 649.2, + "entropy": 0.3314117074012756, + "epoch": 0.8519388954171563, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.4570290446281433, + "learning_rate": 4.27792585413133e-07, + "loss": 0.0001, + "num_tokens": 99966632.0, + "reward": 0.576354193687439, + "reward_std": 0.14871049374341966, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.576354193687439, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3146588295698166, + "sampling/importance_sampling_ratio/max": 1.9622956752777099, + "sampling/importance_sampling_ratio/mean": 0.9999426603317261, + "sampling/importance_sampling_ratio/min": 0.41331798434257505, + "sampling/sampling_logp_difference/max": 0.8975541591644287, + "sampling/sampling_logp_difference/mean": 0.014644469693303108, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.8, + "completions/max_terminated_length": 1559.8, + "completions/mean_length": 1027.846875, + "completions/mean_terminated_length": 1027.846875, + "completions/min_length": 620.4, + "completions/min_terminated_length": 620.4, + "entropy": 0.3280536949634552, + "epoch": 0.8578143360752056, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6543668508529663, + "learning_rate": 4.271868185122365e-07, + "loss": 0.0029, + "num_tokens": 100642375.0, + "reward": 0.54197918176651, + "reward_std": 0.17115794867277145, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.54197918176651, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36070212721824646, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000412940979004, + "sampling/importance_sampling_ratio/min": 0.3607532560825348, + "sampling/sampling_logp_difference/max": 1.123932695388794, + "sampling/sampling_logp_difference/mean": 0.014646673388779164, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1375.8, + "completions/max_terminated_length": 1375.8, + "completions/mean_length": 977.796875, + "completions/mean_terminated_length": 977.796875, + "completions/min_length": 622.4, + "completions/min_terminated_length": 622.4, + "entropy": 0.31903391480445864, + "epoch": 0.863689776733255, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.9127265810966492, + "learning_rate": 4.2658105161133994e-07, + "loss": -0.004, + "num_tokens": 101287654.0, + "reward": 0.6091145873069763, + "reward_std": 0.14868417531251907, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6091145873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2740251809358597, + "sampling/importance_sampling_ratio/max": 1.9419521808624267, + "sampling/importance_sampling_ratio/mean": 1.0001082062721252, + "sampling/importance_sampling_ratio/min": 0.39746988415718076, + "sampling/sampling_logp_difference/max": 1.070701003074646, + "sampling/sampling_logp_difference/mean": 0.014321417547762394, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1902.4, + "completions/max_terminated_length": 1772.4, + "completions/mean_length": 1019.028125, + "completions/mean_terminated_length": 1014.390185546875, + "completions/min_length": 597.6, + "completions/min_terminated_length": 597.6, + "entropy": 0.2945869266986847, + "epoch": 0.8695652173913043, + "frac_reward_zero_std": 0.05, + "grad_norm": 1.0506489276885986, + "learning_rate": 4.2597528471044343e-07, + "loss": 0.0036, + "num_tokens": 101951611.0, + "reward": 0.6753125071525574, + "reward_std": 0.19040150046348572, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6753125190734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29441867470741273, + "sampling/importance_sampling_ratio/max": 1.9205948114395142, + "sampling/importance_sampling_ratio/mean": 0.9999702334403991, + "sampling/importance_sampling_ratio/min": 0.2421664908528328, + "sampling/sampling_logp_difference/max": 1.5547382235527039, + "sampling/sampling_logp_difference/mean": 0.013843800313770771, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1479.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 976.290625, + "completions/mean_terminated_length": 976.290625, + "completions/min_length": 609.4, + "completions/min_terminated_length": 609.4, + "entropy": 0.2982940495014191, + "epoch": 0.8754406580493537, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7624930143356323, + "learning_rate": 4.2536951780954687e-07, + "loss": 0.0197, + "num_tokens": 102593608.0, + "reward": 0.6481770992279052, + "reward_std": 0.16066179424524307, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6481770992279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28471835851669314, + "sampling/importance_sampling_ratio/max": 1.9874407529830933, + "sampling/importance_sampling_ratio/mean": 0.9999994158744812, + "sampling/importance_sampling_ratio/min": 0.4114152193069458, + "sampling/sampling_logp_difference/max": 0.9713044166564941, + "sampling/sampling_logp_difference/mean": 0.013807599991559982, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1364.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 979.490625, + "completions/mean_terminated_length": 979.490625, + "completions/min_length": 616.0, + "completions/min_terminated_length": 616.0, + "entropy": 0.29739575982093813, + "epoch": 0.881316098707403, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.9040115475654602, + "learning_rate": 4.2476375090865036e-07, + "loss": 0.0015, + "num_tokens": 103239925.0, + "reward": 0.5628646016120911, + "reward_std": 0.14127539545297624, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5628646016120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3246196061372757, + "sampling/importance_sampling_ratio/max": 1.9362622261047364, + "sampling/importance_sampling_ratio/mean": 0.9999957323074341, + "sampling/importance_sampling_ratio/min": 0.45249093174934385, + "sampling/sampling_logp_difference/max": 0.9302958488464356, + "sampling/sampling_logp_difference/mean": 0.013709403574466705, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 941.628125, + "completions/mean_terminated_length": 941.628125, + "completions/min_length": 668.6, + "completions/min_terminated_length": 668.6, + "entropy": 0.2873253464698792, + "epoch": 0.8871915393654524, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7270746231079102, + "learning_rate": 4.2415798400775385e-07, + "loss": -0.0041, + "num_tokens": 103848574.0, + "reward": 0.6278125166893005, + "reward_std": 0.13141954243183135, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6278125166893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20140623152256013, + "sampling/importance_sampling_ratio/max": 1.9731908082962035, + "sampling/importance_sampling_ratio/mean": 0.9999493479728698, + "sampling/importance_sampling_ratio/min": 0.29442899525165556, + "sampling/sampling_logp_difference/max": 1.5388960361480712, + "sampling/sampling_logp_difference/mean": 0.013377194851636886, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1579.2, + "completions/max_terminated_length": 1468.4, + "completions/mean_length": 1000.2125, + "completions/mean_terminated_length": 995.6033935546875, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.28227120637893677, + "epoch": 0.8930669800235017, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8500383496284485, + "learning_rate": 4.2355221710685723e-07, + "loss": 0.0123, + "num_tokens": 104456958.0, + "reward": 0.6193229198455811, + "reward_std": 0.2092406004667282, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.61932293176651, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31036264896392823, + "sampling/importance_sampling_ratio/max": 1.9621197700500488, + "sampling/importance_sampling_ratio/mean": 0.9999842524528504, + "sampling/importance_sampling_ratio/min": 0.37887851893901825, + "sampling/sampling_logp_difference/max": 1.138273000717163, + "sampling/sampling_logp_difference/mean": 0.013238861598074437, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 896.709375, + "completions/mean_terminated_length": 896.709375, + "completions/min_length": 568.8, + "completions/min_terminated_length": 568.8, + "entropy": 0.29374930262565613, + "epoch": 0.8989424206815512, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.9952046871185303, + "learning_rate": 4.229464502059607e-07, + "loss": -0.006, + "num_tokens": 105078945.0, + "reward": 0.6104687571525573, + "reward_std": 0.16176794916391374, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6104687750339508, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2711296409368515, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000908970832825, + "sampling/importance_sampling_ratio/min": 0.32102622333914044, + "sampling/sampling_logp_difference/max": 2.261718225479126, + "sampling/sampling_logp_difference/mean": 0.01387611273676157, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1570.8, + "completions/max_terminated_length": 1437.8, + "completions/mean_length": 943.59375, + "completions/mean_terminated_length": 938.8944580078125, + "completions/min_length": 564.8, + "completions/min_terminated_length": 564.8, + "entropy": 0.281459829211235, + "epoch": 0.9048178613396005, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8672675490379333, + "learning_rate": 4.2234068330506415e-07, + "loss": 0.0008, + "num_tokens": 105700507.0, + "reward": 0.5471354305744172, + "reward_std": 0.19499558210372925, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5471354305744172, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3031169354915619, + "sampling/importance_sampling_ratio/max": 1.986432147026062, + "sampling/importance_sampling_ratio/mean": 0.999950909614563, + "sampling/importance_sampling_ratio/min": 0.22982776015996934, + "sampling/sampling_logp_difference/max": 1.6226904392242432, + "sampling/sampling_logp_difference/mean": 0.013240163028240205, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1298.8, + "completions/max_terminated_length": 1298.8, + "completions/mean_length": 922.871875, + "completions/mean_terminated_length": 922.871875, + "completions/min_length": 498.8, + "completions/min_terminated_length": 498.8, + "entropy": 0.2916738152503967, + "epoch": 0.9106933019976499, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.9539207220077515, + "learning_rate": 4.2173491640416764e-07, + "loss": 0.0091, + "num_tokens": 106338834.0, + "reward": 0.5891145884990692, + "reward_std": 0.19707020074129106, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5891146004199982, + "rewards/e2e_recall_precision_mixed_reward/std": 0.347181960940361, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999728560447693, + "sampling/importance_sampling_ratio/min": 0.4203726887702942, + "sampling/sampling_logp_difference/max": 1.1795375108718873, + "sampling/sampling_logp_difference/mean": 0.013758598454296589, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1438.6, + "completions/max_terminated_length": 1438.6, + "completions/mean_length": 934.446875, + "completions/mean_terminated_length": 934.446875, + "completions/min_length": 561.4, + "completions/min_terminated_length": 561.4, + "entropy": 0.28705472946166993, + "epoch": 0.9165687426556992, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8895977139472961, + "learning_rate": 4.2112914950327113e-07, + "loss": 0.0087, + "num_tokens": 107003489.0, + "reward": 0.5985937595367432, + "reward_std": 0.12468727231025696, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5985937595367432, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36896389722824097, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000386953353881, + "sampling/importance_sampling_ratio/min": 0.3284500002861023, + "sampling/sampling_logp_difference/max": 1.1372824430465698, + "sampling/sampling_logp_difference/mean": 0.013641990534961224, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1454.8, + "completions/max_terminated_length": 1454.8, + "completions/mean_length": 969.2, + "completions/mean_terminated_length": 969.2, + "completions/min_length": 629.2, + "completions/min_terminated_length": 629.2, + "entropy": 0.280923467874527, + "epoch": 0.9224441833137486, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6472572088241577, + "learning_rate": 4.2052338260237457e-07, + "loss": 0.0042, + "num_tokens": 107634865.0, + "reward": 0.7227083444595337, + "reward_std": 0.16849884390830994, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7227083444595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3031183272600174, + "sampling/importance_sampling_ratio/max": 1.7831587314605712, + "sampling/importance_sampling_ratio/mean": 0.9999613523483276, + "sampling/importance_sampling_ratio/min": 0.37517508268356325, + "sampling/sampling_logp_difference/max": 0.9865343809127808, + "sampling/sampling_logp_difference/mean": 0.013296735659241676, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1424.4, + "completions/max_terminated_length": 1424.4, + "completions/mean_length": 945.209375, + "completions/mean_terminated_length": 945.209375, + "completions/min_length": 593.8, + "completions/min_terminated_length": 593.8, + "entropy": 0.28385655879974364, + "epoch": 0.9283196239717979, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.8925604820251465, + "learning_rate": 4.1991761570147806e-07, + "loss": -0.0085, + "num_tokens": 108281476.0, + "reward": 0.7040104150772095, + "reward_std": 0.17787585258483887, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7040104150772095, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2852801322937012, + "sampling/importance_sampling_ratio/max": 1.9631269216537475, + "sampling/importance_sampling_ratio/mean": 1.000104796886444, + "sampling/importance_sampling_ratio/min": 0.2682963252067566, + "sampling/sampling_logp_difference/max": 1.5255920171737671, + "sampling/sampling_logp_difference/mean": 0.013569644838571548, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1297.4, + "completions/max_terminated_length": 1297.4, + "completions/mean_length": 942.734375, + "completions/mean_terminated_length": 942.734375, + "completions/min_length": 572.8, + "completions/min_terminated_length": 572.8, + "entropy": 0.28979550004005433, + "epoch": 0.9341950646298472, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.667854905128479, + "learning_rate": 4.193118488005815e-07, + "loss": 0.0068, + "num_tokens": 108890495.0, + "reward": 0.5463541865348815, + "reward_std": 0.12234707921743393, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5463541865348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.35137582421302793, + "sampling/importance_sampling_ratio/max": 1.9598425388336183, + "sampling/importance_sampling_ratio/mean": 1.0001339554786681, + "sampling/importance_sampling_ratio/min": 0.37145259976387024, + "sampling/sampling_logp_difference/max": 1.0230091214179993, + "sampling/sampling_logp_difference/mean": 0.01373386587947607, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1364.6, + "completions/max_terminated_length": 1364.6, + "completions/mean_length": 996.2625, + "completions/mean_terminated_length": 996.2625, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.27299547791481016, + "epoch": 0.9400705052878966, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.45995670557022095, + "learning_rate": 4.18706081899685e-07, + "loss": 0.0054, + "num_tokens": 109527747.0, + "reward": 0.5731770873069764, + "reward_std": 0.12794461846351624, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5731770873069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31133740544319155, + "sampling/importance_sampling_ratio/max": 1.9356085777282714, + "sampling/importance_sampling_ratio/mean": 0.9999878287315369, + "sampling/importance_sampling_ratio/min": 0.4067450284957886, + "sampling/sampling_logp_difference/max": 0.9950890302658081, + "sampling/sampling_logp_difference/mean": 0.012918901070952416, + "step": 800 + }, + { + "epoch": 0.9400705052878966, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1378.96, + "eval_completions/max_terminated_length": 1378.96, + "eval_completions/mean_length": 902.095625, + "eval_completions/mean_terminated_length": 902.095625, + "eval_completions/min_length": 568.84, + "eval_completions/min_terminated_length": 568.84, + "eval_entropy": 0.2893510788679123, + "eval_frac_reward_zero_std": 0.3, + "eval_loss": 0.0025282795540988445, + "eval_num_tokens": 109527747.0, + "eval_reward": 0.6336666762828826, + "eval_reward_std": 0.1548786437511444, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6336666780710221, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3238634419441223, + "eval_runtime": 369.8544, + "eval_samples_per_second": 0.27, + "eval_sampling/importance_sampling_ratio/max": 1.9275440073013306, + "eval_sampling/importance_sampling_ratio/mean": 0.9999588131904602, + "eval_sampling/importance_sampling_ratio/min": 0.3401543361693621, + "eval_sampling/sampling_logp_difference/max": 1.2501714992523194, + "eval_sampling/sampling_logp_difference/mean": 0.013715669251978398, + "eval_steps_per_second": 0.005, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1472.6, + "completions/max_terminated_length": 1472.6, + "completions/mean_length": 959.875, + "completions/mean_terminated_length": 959.875, + "completions/min_length": 569.0, + "completions/min_terminated_length": 569.0, + "entropy": 0.30030797719955443, + "epoch": 0.9459459459459459, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8915446996688843, + "learning_rate": 4.181003149987885e-07, + "loss": 0.0076, + "num_tokens": 110157995.0, + "reward": 0.6484375119209289, + "reward_std": 0.16320026628673076, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6484375119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34366492331027987, + "sampling/importance_sampling_ratio/max": 1.9439698219299317, + "sampling/importance_sampling_ratio/mean": 1.0000813722610473, + "sampling/importance_sampling_ratio/min": 0.28806779980659486, + "sampling/sampling_logp_difference/max": 1.340969157218933, + "sampling/sampling_logp_difference/mean": 0.013933260180056094, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.6, + "completions/max_terminated_length": 1385.6, + "completions/mean_length": 937.865625, + "completions/mean_terminated_length": 937.865625, + "completions/min_length": 572.4, + "completions/min_terminated_length": 572.4, + "entropy": 0.2845805108547211, + "epoch": 0.9518213866039953, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8561508655548096, + "learning_rate": 4.174945480978919e-07, + "loss": -0.0071, + "num_tokens": 110759248.0, + "reward": 0.6078125178813935, + "reward_std": 0.1593657538294792, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6078125178813935, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32861425280570983, + "sampling/importance_sampling_ratio/max": 1.9477681398391724, + "sampling/importance_sampling_ratio/mean": 1.000069797039032, + "sampling/importance_sampling_ratio/min": 0.36302418559789656, + "sampling/sampling_logp_difference/max": 1.2235562324523925, + "sampling/sampling_logp_difference/mean": 0.01336444988846779, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1476.8, + "completions/max_terminated_length": 1476.8, + "completions/mean_length": 894.56875, + "completions/mean_terminated_length": 894.56875, + "completions/min_length": 505.6, + "completions/min_terminated_length": 505.6, + "entropy": 0.2862889677286148, + "epoch": 0.9576968272620446, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8311812281608582, + "learning_rate": 4.168887811969954e-07, + "loss": 0.0094, + "num_tokens": 111360918.0, + "reward": 0.6596875190734863, + "reward_std": 0.1476286917924881, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6596875190734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3123969078063965, + "sampling/importance_sampling_ratio/max": 1.830119013786316, + "sampling/importance_sampling_ratio/mean": 0.9999667048454285, + "sampling/importance_sampling_ratio/min": 0.28043819926679137, + "sampling/sampling_logp_difference/max": 1.6224053859710694, + "sampling/sampling_logp_difference/mean": 0.013792328163981437, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1540.8, + "completions/max_terminated_length": 1540.8, + "completions/mean_length": 922.28125, + "completions/mean_terminated_length": 922.28125, + "completions/min_length": 522.4, + "completions/min_terminated_length": 522.4, + "entropy": 0.28678136467933657, + "epoch": 0.963572267920094, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.9352256059646606, + "learning_rate": 4.1628301429609884e-07, + "loss": -0.0143, + "num_tokens": 111999568.0, + "reward": 0.669218772649765, + "reward_std": 0.13827429413795472, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.669218772649765, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29794834554195404, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000133991241455, + "sampling/importance_sampling_ratio/min": 0.31264116019010546, + "sampling/sampling_logp_difference/max": 1.346689224243164, + "sampling/sampling_logp_difference/mean": 0.014031770080327988, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 906.35625, + "completions/mean_terminated_length": 906.35625, + "completions/min_length": 596.6, + "completions/min_terminated_length": 596.6, + "entropy": 0.2764395475387573, + "epoch": 0.9694477085781433, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4546630382537842, + "learning_rate": 4.156772473952023e-07, + "loss": 0.0027, + "num_tokens": 112621858.0, + "reward": 0.6304166793823243, + "reward_std": 0.11817457228899002, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6304166793823243, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29249594509601595, + "sampling/importance_sampling_ratio/max": 1.9534301519393922, + "sampling/importance_sampling_ratio/mean": 1.000028908252716, + "sampling/importance_sampling_ratio/min": 0.37757673263549807, + "sampling/sampling_logp_difference/max": 1.0168023586273194, + "sampling/sampling_logp_difference/mean": 0.013349436223506927, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1652.2, + "completions/max_terminated_length": 1420.2, + "completions/mean_length": 914.15625, + "completions/mean_terminated_length": 909.2442016601562, + "completions/min_length": 581.6, + "completions/min_terminated_length": 581.6, + "entropy": 0.2829986423254013, + "epoch": 0.9753231492361927, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.637842059135437, + "learning_rate": 4.150714804943058e-07, + "loss": -0.0096, + "num_tokens": 113231744.0, + "reward": 0.6623437643051148, + "reward_std": 0.13550456166267394, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6623437643051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31752758026123046, + "sampling/importance_sampling_ratio/max": 1.9561806201934815, + "sampling/importance_sampling_ratio/mean": 0.9999186754226684, + "sampling/importance_sampling_ratio/min": 0.19964413106240678, + "sampling/sampling_logp_difference/max": 5.088151931762695, + "sampling/sampling_logp_difference/mean": 0.013989451713860035, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1466.2, + "completions/max_terminated_length": 1466.2, + "completions/mean_length": 941.5, + "completions/mean_terminated_length": 941.5, + "completions/min_length": 533.6, + "completions/min_terminated_length": 533.6, + "entropy": 0.28082106113433836, + "epoch": 0.981198589894242, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.9239412546157837, + "learning_rate": 4.1446571359340925e-07, + "loss": 0.0049, + "num_tokens": 113838384.0, + "reward": 0.6610416769981384, + "reward_std": 0.1860102355480194, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6610416769981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28068467378616335, + "sampling/importance_sampling_ratio/max": 1.9927521705627442, + "sampling/importance_sampling_ratio/mean": 1.000089454650879, + "sampling/importance_sampling_ratio/min": 0.3198125422000885, + "sampling/sampling_logp_difference/max": 1.2942631721496582, + "sampling/sampling_logp_difference/mean": 0.013557923585176468, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1497.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 976.396875, + "completions/mean_terminated_length": 976.396875, + "completions/min_length": 507.6, + "completions/min_terminated_length": 507.6, + "entropy": 0.2998843610286713, + "epoch": 0.9870740305522914, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7953653335571289, + "learning_rate": 4.1385994669251274e-07, + "loss": -0.0011, + "num_tokens": 114463567.0, + "reward": 0.6831770896911621, + "reward_std": 0.16244979202747345, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6831770896911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28926865458488465, + "sampling/importance_sampling_ratio/max": 1.926166844367981, + "sampling/importance_sampling_ratio/mean": 1.0001335382461547, + "sampling/importance_sampling_ratio/min": 0.40983167886734007, + "sampling/sampling_logp_difference/max": 0.9074871063232421, + "sampling/sampling_logp_difference/mean": 0.014117139205336571, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1299.6, + "completions/max_terminated_length": 1299.6, + "completions/mean_length": 826.9625, + "completions/mean_terminated_length": 826.9625, + "completions/min_length": 363.6, + "completions/min_terminated_length": 363.6, + "entropy": 0.2793893039226532, + "epoch": 0.9929494712103408, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.5206387639045715, + "learning_rate": 4.132541797916161e-07, + "loss": -0.0096, + "num_tokens": 115050275.0, + "reward": 0.7274479269981384, + "reward_std": 0.1434938132762909, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7274479269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2630143046379089, + "sampling/importance_sampling_ratio/max": 1.9405413150787354, + "sampling/importance_sampling_ratio/mean": 1.000065279006958, + "sampling/importance_sampling_ratio/min": 0.4045724630355835, + "sampling/sampling_logp_difference/max": 0.9253458499908447, + "sampling/sampling_logp_difference/mean": 0.013940737955272198, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1406.2, + "completions/max_terminated_length": 1406.2, + "completions/mean_length": 959.43125, + "completions/mean_terminated_length": 959.43125, + "completions/min_length": 589.2, + "completions/min_terminated_length": 589.2, + "entropy": 0.28270782232284547, + "epoch": 0.9988249118683902, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.46818044781684875, + "learning_rate": 4.126484128907196e-07, + "loss": -0.0087, + "num_tokens": 115656285.0, + "reward": 0.6919270932674408, + "reward_std": 0.12568674832582474, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6919270932674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3104670524597168, + "sampling/importance_sampling_ratio/max": 1.931183409690857, + "sampling/importance_sampling_ratio/mean": 1.000103223323822, + "sampling/importance_sampling_ratio/min": 0.36354232132434844, + "sampling/sampling_logp_difference/max": 1.114108383655548, + "sampling/sampling_logp_difference/mean": 0.013803380355238915, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1560.6, + "completions/max_terminated_length": 1560.6, + "completions/mean_length": 1058.790625, + "completions/mean_terminated_length": 1058.790625, + "completions/min_length": 636.6, + "completions/min_terminated_length": 636.6, + "entropy": 0.2910564005374908, + "epoch": 1.0047003525264395, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8775485754013062, + "learning_rate": 4.120426459898231e-07, + "loss": 0.0008, + "num_tokens": 116329418.0, + "reward": 0.6715104341506958, + "reward_std": 0.16097910925745965, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6715104341506958, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2587837889790535, + "sampling/importance_sampling_ratio/max": 1.9636148691177369, + "sampling/importance_sampling_ratio/mean": 1.0000767588615418, + "sampling/importance_sampling_ratio/min": 0.3296537220478058, + "sampling/sampling_logp_difference/max": 1.5510367155075073, + "sampling/sampling_logp_difference/mean": 0.013851667195558548, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1445.8, + "completions/max_terminated_length": 1445.8, + "completions/mean_length": 930.640625, + "completions/mean_terminated_length": 930.640625, + "completions/min_length": 546.2, + "completions/min_terminated_length": 546.2, + "entropy": 0.2729405462741852, + "epoch": 1.0105757931844888, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.7856388688087463, + "learning_rate": 4.1143687908892654e-07, + "loss": -0.0039, + "num_tokens": 116934247.0, + "reward": 0.795677101612091, + "reward_std": 0.19950651228427888, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.795677101612091, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28640814423561095, + "sampling/importance_sampling_ratio/max": 1.9782800912857055, + "sampling/importance_sampling_ratio/mean": 1.0000068426132203, + "sampling/importance_sampling_ratio/min": 0.2158554643392563, + "sampling/sampling_logp_difference/max": 2.2541099786758423, + "sampling/sampling_logp_difference/mean": 0.013442268781363963, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1620.4, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 1019.85625, + "completions/mean_terminated_length": 1012.270751953125, + "completions/min_length": 617.4, + "completions/min_terminated_length": 617.4, + "entropy": 0.2991854906082153, + "epoch": 1.0164512338425382, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.5161347389221191, + "learning_rate": 4.1083111218803003e-07, + "loss": -0.0024, + "num_tokens": 117587905.0, + "reward": 0.5876562565565109, + "reward_std": 0.12923510670661925, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.5876562565565109, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2704192191362381, + "sampling/importance_sampling_ratio/max": 1.9975411891937256, + "sampling/importance_sampling_ratio/mean": 1.000113844871521, + "sampling/importance_sampling_ratio/min": 0.3002059832215309, + "sampling/sampling_logp_difference/max": 1.312308168411255, + "sampling/sampling_logp_difference/mean": 0.014147781021893024, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1463.4, + "completions/max_terminated_length": 1463.4, + "completions/mean_length": 978.790625, + "completions/mean_terminated_length": 978.790625, + "completions/min_length": 589.6, + "completions/min_terminated_length": 589.6, + "entropy": 0.2838165521621704, + "epoch": 1.0223266745005875, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6190516948699951, + "learning_rate": 4.1022534528713347e-07, + "loss": 0.0094, + "num_tokens": 118222606.0, + "reward": 0.6153125166893005, + "reward_std": 0.1697417378425598, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6153125166893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3247655272483826, + "sampling/importance_sampling_ratio/max": 1.9137526273727417, + "sampling/importance_sampling_ratio/mean": 0.9999405384063721, + "sampling/importance_sampling_ratio/min": 0.33550558388233187, + "sampling/sampling_logp_difference/max": 1.268647813796997, + "sampling/sampling_logp_difference/mean": 0.013852118141949176, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1434.4, + "completions/max_terminated_length": 1434.4, + "completions/mean_length": 970.478125, + "completions/mean_terminated_length": 970.478125, + "completions/min_length": 625.4, + "completions/min_terminated_length": 625.4, + "entropy": 0.28675009608268737, + "epoch": 1.028202115158637, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.9456232786178589, + "learning_rate": 4.0961957838623695e-07, + "loss": -0.0062, + "num_tokens": 118862071.0, + "reward": 0.6469270884990692, + "reward_std": 0.160466830432415, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6469270884990692, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3246330052614212, + "sampling/importance_sampling_ratio/max": 1.9722527265548706, + "sampling/importance_sampling_ratio/mean": 0.999930226802826, + "sampling/importance_sampling_ratio/min": 0.3685579001903534, + "sampling/sampling_logp_difference/max": 1.2024194717407226, + "sampling/sampling_logp_difference/mean": 0.013803689368069172, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1432.0, + "completions/max_terminated_length": 1432.0, + "completions/mean_length": 965.79375, + "completions/mean_terminated_length": 965.79375, + "completions/min_length": 664.2, + "completions/min_terminated_length": 664.2, + "entropy": 0.2727407574653625, + "epoch": 1.0340775558166861, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7059847712516785, + "learning_rate": 4.0901381148534044e-07, + "loss": 0.0025, + "num_tokens": 119470421.0, + "reward": 0.7513020873069763, + "reward_std": 0.1561640739440918, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7513020873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29153428971767426, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999707937240601, + "sampling/importance_sampling_ratio/min": 0.3273321449756622, + "sampling/sampling_logp_difference/max": 1.3830796003341674, + "sampling/sampling_logp_difference/mean": 0.013064392656087876, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1410.0, + "completions/max_terminated_length": 1410.0, + "completions/mean_length": 874.78125, + "completions/mean_terminated_length": 874.78125, + "completions/min_length": 512.2, + "completions/min_terminated_length": 512.2, + "entropy": 0.27989777326583865, + "epoch": 1.0399529964747356, + "frac_reward_zero_std": 0.55, + "grad_norm": 8.109946250915527, + "learning_rate": 4.084080445844439e-07, + "loss": 0.0028, + "num_tokens": 120109087.0, + "reward": 0.612500011920929, + "reward_std": 0.09955177009105683, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.612500011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3614356517791748, + "sampling/importance_sampling_ratio/max": 1.9737717866897584, + "sampling/importance_sampling_ratio/mean": 1.0000153660774231, + "sampling/importance_sampling_ratio/min": 0.4128256618976593, + "sampling/sampling_logp_difference/max": 0.9954976558685302, + "sampling/sampling_logp_difference/mean": 0.013704303652048111, + "step": 885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1469.0, + "completions/max_terminated_length": 1469.0, + "completions/mean_length": 987.765625, + "completions/mean_terminated_length": 987.765625, + "completions/min_length": 618.6, + "completions/min_terminated_length": 618.6, + "entropy": 0.28374720811843873, + "epoch": 1.045828437132785, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6571109890937805, + "learning_rate": 4.0780227768354737e-07, + "loss": 0.0109, + "num_tokens": 120751044.0, + "reward": 0.686614590883255, + "reward_std": 0.12360157519578933, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.686614590883255, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2697263121604919, + "sampling/importance_sampling_ratio/max": 1.9664803504943849, + "sampling/importance_sampling_ratio/mean": 1.0000463843345642, + "sampling/importance_sampling_ratio/min": 0.2772391699254513, + "sampling/sampling_logp_difference/max": 1.5622613430023193, + "sampling/sampling_logp_difference/mean": 0.013510177657008172, + "step": 890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1600.8, + "completions/max_terminated_length": 1600.8, + "completions/mean_length": 982.44375, + "completions/mean_terminated_length": 982.44375, + "completions/min_length": 499.2, + "completions/min_terminated_length": 499.2, + "entropy": 0.2803249657154083, + "epoch": 1.0517038777908343, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8509756326675415, + "learning_rate": 4.071965107826508e-07, + "loss": 0.0015, + "num_tokens": 121402178.0, + "reward": 0.6618229389190674, + "reward_std": 0.13814076781272888, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6618229389190674, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3093242943286896, + "sampling/importance_sampling_ratio/max": 1.9148869276046754, + "sampling/importance_sampling_ratio/mean": 0.9999146819114685, + "sampling/importance_sampling_ratio/min": 0.3448833405971527, + "sampling/sampling_logp_difference/max": 1.3441654443740845, + "sampling/sampling_logp_difference/mean": 0.013470856286585332, + "step": 895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1447.6, + "completions/max_terminated_length": 1447.6, + "completions/mean_length": 971.4125, + "completions/mean_terminated_length": 971.4125, + "completions/min_length": 522.4, + "completions/min_terminated_length": 522.4, + "entropy": 0.28140476942062376, + "epoch": 1.0575793184488838, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7652623057365417, + "learning_rate": 4.065907438817543e-07, + "loss": 0.0049, + "num_tokens": 122006054.0, + "reward": 0.7295312762260437, + "reward_std": 0.10282711908221245, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7295312762260437, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28697769343852997, + "sampling/importance_sampling_ratio/max": 1.9570077180862426, + "sampling/importance_sampling_ratio/mean": 0.9998312950134277, + "sampling/importance_sampling_ratio/min": 0.35704835057258605, + "sampling/sampling_logp_difference/max": 1.3200425148010253, + "sampling/sampling_logp_difference/mean": 0.013536373898386956, + "step": 900 + }, + { + "epoch": 1.0575793184488838, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1455.64, + "eval_completions/max_terminated_length": 1455.64, + "eval_completions/mean_length": 907.501875, + "eval_completions/mean_terminated_length": 907.501875, + "eval_completions/min_length": 554.48, + "eval_completions/min_terminated_length": 554.48, + "eval_entropy": 0.2828002864122391, + "eval_frac_reward_zero_std": 0.41, + "eval_loss": 0.0018464005552232265, + "eval_num_tokens": 122006054.0, + "eval_reward": 0.6610416805744171, + "eval_reward_std": 0.12711612805724143, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6610416841506958, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3318196302652359, + "eval_runtime": 377.4818, + "eval_samples_per_second": 0.265, + "eval_sampling/importance_sampling_ratio/max": 1.9399487686157226, + "eval_sampling/importance_sampling_ratio/mean": 1.0000247478485107, + "eval_sampling/importance_sampling_ratio/min": 0.34321905925869944, + "eval_sampling/sampling_logp_difference/max": 1.2090392780303956, + "eval_sampling/sampling_logp_difference/mean": 0.013470363169908524, + "eval_steps_per_second": 0.005, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1200.4, + "completions/max_terminated_length": 1200.4, + "completions/mean_length": 885.95625, + "completions/mean_terminated_length": 885.95625, + "completions/min_length": 559.6, + "completions/min_terminated_length": 559.6, + "entropy": 0.2904858112335205, + "epoch": 1.063454759106933, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.5744009613990784, + "learning_rate": 4.059849769808578e-07, + "loss": 0.0104, + "num_tokens": 122606664.0, + "reward": 0.7563021063804627, + "reward_std": 0.12077359333634377, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7563021063804627, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25939937233924865, + "sampling/importance_sampling_ratio/max": 1.9366278409957887, + "sampling/importance_sampling_ratio/mean": 1.0000674724578857, + "sampling/importance_sampling_ratio/min": 0.3904744863510132, + "sampling/sampling_logp_difference/max": 1.0748745203018188, + "sampling/sampling_logp_difference/mean": 0.013862324692308903, + "step": 905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1405.6, + "completions/max_terminated_length": 1405.6, + "completions/mean_length": 947.584375, + "completions/mean_terminated_length": 947.584375, + "completions/min_length": 553.2, + "completions/min_terminated_length": 553.2, + "entropy": 0.28678281903266906, + "epoch": 1.0693301997649824, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8974121809005737, + "learning_rate": 4.053792100799612e-07, + "loss": 0.0017, + "num_tokens": 123210147.0, + "reward": 0.6382291734218597, + "reward_std": 0.15600190162658692, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6382291734218597, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3487572968006134, + "sampling/importance_sampling_ratio/max": 1.9583675384521484, + "sampling/importance_sampling_ratio/mean": 0.9999847769737243, + "sampling/importance_sampling_ratio/min": 0.3770772695541382, + "sampling/sampling_logp_difference/max": 1.145704698562622, + "sampling/sampling_logp_difference/mean": 0.013637512736022473, + "step": 910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1463.6, + "completions/max_terminated_length": 1463.6, + "completions/mean_length": 1009.46875, + "completions/mean_terminated_length": 1009.46875, + "completions/min_length": 643.2, + "completions/min_terminated_length": 643.2, + "entropy": 0.2971984803676605, + "epoch": 1.0752056404230317, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7462270259857178, + "learning_rate": 4.047734431790647e-07, + "loss": -0.0047, + "num_tokens": 123839481.0, + "reward": 0.7052083373069763, + "reward_std": 0.16606248915195465, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7052083373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27336723506450655, + "sampling/importance_sampling_ratio/max": 1.9635993242263794, + "sampling/importance_sampling_ratio/mean": 1.0000979065895081, + "sampling/importance_sampling_ratio/min": 0.313095235824585, + "sampling/sampling_logp_difference/max": 1.3117567539215087, + "sampling/sampling_logp_difference/mean": 0.013715284876525402, + "step": 915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1282.0, + "completions/max_terminated_length": 1282.0, + "completions/mean_length": 916.96875, + "completions/mean_terminated_length": 916.96875, + "completions/min_length": 545.4, + "completions/min_terminated_length": 545.4, + "entropy": 0.26898905336856843, + "epoch": 1.0810810810810811, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 4.0416767627816815e-07, + "loss": -0.0374, + "num_tokens": 124465855.0, + "reward": 0.7244791984558105, + "reward_std": 0.0911778524518013, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7244791984558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2757032006978989, + "sampling/importance_sampling_ratio/max": 1.7921949625015259, + "sampling/importance_sampling_ratio/mean": 0.9998450398445129, + "sampling/importance_sampling_ratio/min": 0.2510555416345596, + "sampling/sampling_logp_difference/max": 1.4895573616027833, + "sampling/sampling_logp_difference/mean": 0.013172058574855327, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1380.6, + "completions/max_terminated_length": 1380.6, + "completions/mean_length": 949.85625, + "completions/mean_terminated_length": 949.85625, + "completions/min_length": 602.4, + "completions/min_terminated_length": 602.4, + "entropy": 0.27048816680908205, + "epoch": 1.0869565217391304, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.9172391891479492, + "learning_rate": 4.035619093772716e-07, + "loss": 0.0013, + "num_tokens": 125084081.0, + "reward": 0.6635416746139526, + "reward_std": 0.13708136826753617, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6635416746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3219247102737427, + "sampling/importance_sampling_ratio/max": 1.861747407913208, + "sampling/importance_sampling_ratio/mean": 0.9998850226402283, + "sampling/importance_sampling_ratio/min": 0.297413569688797, + "sampling/sampling_logp_difference/max": 1.513976526260376, + "sampling/sampling_logp_difference/mean": 0.012974279746413232, + "step": 925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1404.2, + "completions/max_terminated_length": 1404.2, + "completions/mean_length": 947.790625, + "completions/mean_terminated_length": 947.790625, + "completions/min_length": 576.0, + "completions/min_terminated_length": 576.0, + "entropy": 0.2738521546125412, + "epoch": 1.0928319623971798, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.8329576849937439, + "learning_rate": 4.0295614247637507e-07, + "loss": -0.0001, + "num_tokens": 125721630.0, + "reward": 0.7278645873069763, + "reward_std": 0.13716669231653214, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7278645873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2882854163646698, + "sampling/importance_sampling_ratio/max": 1.9897720098495484, + "sampling/importance_sampling_ratio/mean": 1.0000043034553527, + "sampling/importance_sampling_ratio/min": 0.3875389933586121, + "sampling/sampling_logp_difference/max": 1.3065906524658204, + "sampling/sampling_logp_difference/mean": 0.013173772767186166, + "step": 930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1520.4, + "completions/max_terminated_length": 1520.4, + "completions/mean_length": 953.040625, + "completions/mean_terminated_length": 953.040625, + "completions/min_length": 603.6, + "completions/min_terminated_length": 603.6, + "entropy": 0.2765673935413361, + "epoch": 1.098707403055229, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8879562616348267, + "learning_rate": 4.023503755754785e-07, + "loss": -0.0082, + "num_tokens": 126360139.0, + "reward": 0.6124479234218597, + "reward_std": 0.17750487923622132, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6124479234218597, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2882064312696457, + "sampling/importance_sampling_ratio/max": 1.9080177783966064, + "sampling/importance_sampling_ratio/mean": 0.9999000906944275, + "sampling/importance_sampling_ratio/min": 0.3805552273988724, + "sampling/sampling_logp_difference/max": 1.0973583936691285, + "sampling/sampling_logp_difference/mean": 0.013284470327198506, + "step": 935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1582.0, + "completions/max_terminated_length": 1582.0, + "completions/mean_length": 971.50625, + "completions/mean_terminated_length": 971.50625, + "completions/min_length": 586.8, + "completions/min_terminated_length": 586.8, + "entropy": 0.3045738399028778, + "epoch": 1.1045828437132785, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.7119807600975037, + "learning_rate": 4.01744608674582e-07, + "loss": -0.0073, + "num_tokens": 127001389.0, + "reward": 0.7123437762260437, + "reward_std": 0.17186959385871886, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7123437762260437, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3540113389492035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000130653381347, + "sampling/importance_sampling_ratio/min": 0.3947632074356079, + "sampling/sampling_logp_difference/max": 1.1084512948989869, + "sampling/sampling_logp_difference/mean": 0.014095421135425567, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.0, + "completions/max_terminated_length": 1514.0, + "completions/mean_length": 969.684375, + "completions/mean_terminated_length": 969.684375, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.30098283290863037, + "epoch": 1.1104582843713278, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8372572064399719, + "learning_rate": 4.0113884177368543e-07, + "loss": -0.004, + "num_tokens": 127621528.0, + "reward": 0.6710937559604645, + "reward_std": 0.1159849688410759, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6710937559604645, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3547815322875977, + "sampling/importance_sampling_ratio/max": 1.9597235918045044, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.4007263362407684, + "sampling/sampling_logp_difference/max": 1.044080376625061, + "sampling/sampling_logp_difference/mean": 0.014144887030124665, + "step": 945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1583.0, + "completions/max_terminated_length": 1583.0, + "completions/mean_length": 988.59375, + "completions/mean_terminated_length": 988.59375, + "completions/min_length": 660.8, + "completions/min_terminated_length": 660.8, + "entropy": 0.2859313428401947, + "epoch": 1.1163337250293772, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7519243955612183, + "learning_rate": 4.005330748727889e-07, + "loss": 0.0136, + "num_tokens": 128225958.0, + "reward": 0.756458330154419, + "reward_std": 0.13837233185768127, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.756458330154419, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24186617136001587, + "sampling/importance_sampling_ratio/max": 1.8979056119918822, + "sampling/importance_sampling_ratio/mean": 1.0001055955886842, + "sampling/importance_sampling_ratio/min": 0.31551241781562567, + "sampling/sampling_logp_difference/max": 2.015387845039368, + "sampling/sampling_logp_difference/mean": 0.013428857550024986, + "step": 950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1616.8, + "completions/max_terminated_length": 1616.8, + "completions/mean_length": 1025.23125, + "completions/mean_terminated_length": 1025.23125, + "completions/min_length": 682.4, + "completions/min_terminated_length": 682.4, + "entropy": 0.30995495319366456, + "epoch": 1.1222091656874265, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.8149730563163757, + "learning_rate": 3.999273079718924e-07, + "loss": 0.0003, + "num_tokens": 128900864.0, + "reward": 0.6762500166893005, + "reward_std": 0.1645580381155014, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6762500166893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25772719383239745, + "sampling/importance_sampling_ratio/max": 1.9383854150772095, + "sampling/importance_sampling_ratio/mean": 1.0000874400138855, + "sampling/importance_sampling_ratio/min": 0.3173495039343834, + "sampling/sampling_logp_difference/max": 1.5993350982666015, + "sampling/sampling_logp_difference/mean": 0.013984563015401363, + "step": 955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 988.059375, + "completions/mean_terminated_length": 988.059375, + "completions/min_length": 628.2, + "completions/min_terminated_length": 628.2, + "entropy": 0.3168246328830719, + "epoch": 1.128084606345476, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.6208041906356812, + "learning_rate": 3.9932154107099585e-07, + "loss": 0.0121, + "num_tokens": 129572115.0, + "reward": 0.6992708444595337, + "reward_std": 0.14730916619300843, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6992708444595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27717364132404326, + "sampling/importance_sampling_ratio/max": 1.9295872926712037, + "sampling/importance_sampling_ratio/mean": 1.0002163410186768, + "sampling/importance_sampling_ratio/min": 0.39420167207717893, + "sampling/sampling_logp_difference/max": 1.2145282983779908, + "sampling/sampling_logp_difference/mean": 0.014649266377091408, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1293.8, + "completions/max_terminated_length": 1293.8, + "completions/mean_length": 939.94375, + "completions/mean_terminated_length": 939.94375, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "entropy": 0.28997875452041627, + "epoch": 1.1339600470035252, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8977941274642944, + "learning_rate": 3.9871577417009934e-07, + "loss": -0.0042, + "num_tokens": 130222945.0, + "reward": 0.7091145992279053, + "reward_std": 0.14130303710699083, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7091145992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31121143102645876, + "sampling/importance_sampling_ratio/max": 1.9547757387161255, + "sampling/importance_sampling_ratio/mean": 0.9999549508094787, + "sampling/importance_sampling_ratio/min": 0.3619807779788971, + "sampling/sampling_logp_difference/max": 1.2246970176696776, + "sampling/sampling_logp_difference/mean": 0.013661802746355534, + "step": 965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1465.6, + "completions/max_terminated_length": 1465.6, + "completions/mean_length": 1035.078125, + "completions/mean_terminated_length": 1035.078125, + "completions/min_length": 594.8, + "completions/min_terminated_length": 594.8, + "entropy": 0.2883412778377533, + "epoch": 1.1398354876615746, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8469650149345398, + "learning_rate": 3.981100072692028e-07, + "loss": -0.0027, + "num_tokens": 130881018.0, + "reward": 0.6606770992279053, + "reward_std": 0.1474784180521965, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6606770992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36500520408153536, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000018060207367, + "sampling/importance_sampling_ratio/min": 0.3527994304895401, + "sampling/sampling_logp_difference/max": 1.2171527862548828, + "sampling/sampling_logp_difference/mean": 0.013521765917539596, + "step": 970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1390.8, + "completions/max_terminated_length": 1390.8, + "completions/mean_length": 1007.9625, + "completions/mean_terminated_length": 1007.9625, + "completions/min_length": 686.8, + "completions/min_terminated_length": 686.8, + "entropy": 0.284423416852951, + "epoch": 1.145710928319624, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6974943280220032, + "learning_rate": 3.9750424036830626e-07, + "loss": -0.0051, + "num_tokens": 131522462.0, + "reward": 0.568177092075348, + "reward_std": 0.10882167518138885, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.568177092075348, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3816623717546463, + "sampling/importance_sampling_ratio/max": 1.7701248168945312, + "sampling/importance_sampling_ratio/mean": 0.9999536991119384, + "sampling/importance_sampling_ratio/min": 0.45498186349868774, + "sampling/sampling_logp_difference/max": 0.8206616401672363, + "sampling/sampling_logp_difference/mean": 0.01323307417333126, + "step": 975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1590.4, + "completions/max_terminated_length": 1590.4, + "completions/mean_length": 1085.0625, + "completions/mean_terminated_length": 1085.0625, + "completions/min_length": 653.0, + "completions/min_terminated_length": 653.0, + "entropy": 0.3135793745517731, + "epoch": 1.1515863689776733, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.0624127388000488, + "learning_rate": 3.9689847346740975e-07, + "loss": -0.0149, + "num_tokens": 132210226.0, + "reward": 0.6392708539962768, + "reward_std": 0.14114319533109665, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6392708539962768, + "rewards/e2e_recall_precision_mixed_reward/std": 0.35320115089416504, + "sampling/importance_sampling_ratio/max": 1.7919421195983887, + "sampling/importance_sampling_ratio/mean": 0.9998708367347717, + "sampling/importance_sampling_ratio/min": 0.30457684099674226, + "sampling/sampling_logp_difference/max": 1.2323057889938354, + "sampling/sampling_logp_difference/mean": 0.014576360583305359, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1604.6, + "completions/max_terminated_length": 1604.6, + "completions/mean_length": 1116.96875, + "completions/mean_terminated_length": 1116.96875, + "completions/min_length": 598.8, + "completions/min_terminated_length": 598.8, + "entropy": 0.32797098755836485, + "epoch": 1.1574618096357228, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.6008588671684265, + "learning_rate": 3.962927065665132e-07, + "loss": 0.0174, + "num_tokens": 132912392.0, + "reward": 0.6921354413032532, + "reward_std": 0.18891476839780807, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6921354413032532, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2808209300041199, + "sampling/importance_sampling_ratio/max": 1.99753577709198, + "sampling/importance_sampling_ratio/mean": 0.9998470783233643, + "sampling/importance_sampling_ratio/min": 0.323549946770072, + "sampling/sampling_logp_difference/max": 1.5377471923828125, + "sampling/sampling_logp_difference/mean": 0.015015862323343754, + "step": 985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1794.0, + "completions/max_terminated_length": 1599.8, + "completions/mean_length": 1081.353125, + "completions/mean_terminated_length": 1076.6255126953124, + "completions/min_length": 662.0, + "completions/min_terminated_length": 662.0, + "entropy": 0.31267160177230835, + "epoch": 1.163337250293772, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.7435998916625977, + "learning_rate": 3.956869396656167e-07, + "loss": -0.0173, + "num_tokens": 133587877.0, + "reward": 0.528125011920929, + "reward_std": 0.13365225195884706, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.528125011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.35800984501838684, + "sampling/importance_sampling_ratio/max": 1.9870172500610352, + "sampling/importance_sampling_ratio/mean": 0.9999597549438477, + "sampling/importance_sampling_ratio/min": 0.4234741389751434, + "sampling/sampling_logp_difference/max": 0.9320145964622497, + "sampling/sampling_logp_difference/mean": 0.01449219174683094, + "step": 990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 1044.7875, + "completions/mean_terminated_length": 1044.7875, + "completions/min_length": 646.2, + "completions/min_terminated_length": 646.2, + "entropy": 0.31615627408027647, + "epoch": 1.1692126909518215, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6654034852981567, + "learning_rate": 3.950811727647201e-07, + "loss": 0.0099, + "num_tokens": 134243985.0, + "reward": 0.8039062738418579, + "reward_std": 0.09408158212900161, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8039062738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2852950572967529, + "sampling/importance_sampling_ratio/max": 1.8471860408782959, + "sampling/importance_sampling_ratio/mean": 0.9998784899711609, + "sampling/importance_sampling_ratio/min": 0.3083785384893417, + "sampling/sampling_logp_difference/max": 1.310741949081421, + "sampling/sampling_logp_difference/mean": 0.014296729303896426, + "step": 995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1674.8, + "completions/max_terminated_length": 1674.8, + "completions/mean_length": 1010.9375, + "completions/mean_terminated_length": 1010.9375, + "completions/min_length": 551.2, + "completions/min_terminated_length": 551.2, + "entropy": 0.30919942259788513, + "epoch": 1.1750881316098707, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.797741174697876, + "learning_rate": 3.944754058638236e-07, + "loss": -0.0179, + "num_tokens": 134896445.0, + "reward": 0.6697916746139526, + "reward_std": 0.14145222902297974, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6697916746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3327239155769348, + "sampling/importance_sampling_ratio/max": 1.9311280488967895, + "sampling/importance_sampling_ratio/mean": 0.9998689293861389, + "sampling/importance_sampling_ratio/min": 0.2995154604315758, + "sampling/sampling_logp_difference/max": 1.3440932035446167, + "sampling/sampling_logp_difference/mean": 0.014279096573591232, + "step": 1000 + }, + { + "epoch": 1.1750881316098707, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000625, + "eval_completions/max_length": 1595.28, + "eval_completions/max_terminated_length": 1578.0, + "eval_completions/mean_length": 1022.27875, + "eval_completions/mean_terminated_length": 1021.36798828125, + "eval_completions/min_length": 659.68, + "eval_completions/min_terminated_length": 659.68, + "eval_entropy": 0.3078763961791992, + "eval_frac_reward_zero_std": 0.39, + "eval_loss": 0.0021373536437749863, + "eval_num_tokens": 134896445.0, + "eval_reward": 0.6751354324817658, + "eval_reward_std": 0.12967597171664239, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6751354372501374, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3238289541006088, + "eval_runtime": 414.1064, + "eval_samples_per_second": 0.241, + "eval_sampling/importance_sampling_ratio/max": 1.892482771873474, + "eval_sampling/importance_sampling_ratio/mean": 0.9999824357032776, + "eval_sampling/importance_sampling_ratio/min": 0.366875017285347, + "eval_sampling/sampling_logp_difference/max": 1.0593089628219605, + "eval_sampling/sampling_logp_difference/mean": 0.01425721075385809, + "eval_steps_per_second": 0.005, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1866.6, + "completions/max_terminated_length": 1866.6, + "completions/mean_length": 1096.41875, + "completions/mean_terminated_length": 1096.41875, + "completions/min_length": 782.2, + "completions/min_terminated_length": 782.2, + "entropy": 0.31382623314857483, + "epoch": 1.1809635722679201, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8752436637878418, + "learning_rate": 3.938696389629271e-07, + "loss": 0.0053, + "num_tokens": 135588419.0, + "reward": 0.7271875023841858, + "reward_std": 0.14706478863954545, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7271875023841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23389651179313659, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000099420547486, + "sampling/importance_sampling_ratio/min": 0.2783097416162491, + "sampling/sampling_logp_difference/max": 1.292723035812378, + "sampling/sampling_logp_difference/mean": 0.014573409222066402, + "step": 1005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1822.0, + "completions/max_terminated_length": 1822.0, + "completions/mean_length": 1118.215625, + "completions/mean_terminated_length": 1118.215625, + "completions/min_length": 695.8, + "completions/min_terminated_length": 695.8, + "entropy": 0.3018251657485962, + "epoch": 1.1868390129259694, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8171986937522888, + "learning_rate": 3.932638720620305e-07, + "loss": -0.0098, + "num_tokens": 136260632.0, + "reward": 0.7615625262260437, + "reward_std": 0.12762271910905837, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7615625262260437, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2775488555431366, + "sampling/importance_sampling_ratio/max": 1.9084330320358276, + "sampling/importance_sampling_ratio/mean": 0.9998735070228577, + "sampling/importance_sampling_ratio/min": 0.37997121512889864, + "sampling/sampling_logp_difference/max": 1.0514094591140748, + "sampling/sampling_logp_difference/mean": 0.013783762976527215, + "step": 1010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1724.4, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 1068.925, + "completions/mean_terminated_length": 1064.7508544921875, + "completions/min_length": 646.6, + "completions/min_terminated_length": 646.6, + "entropy": 0.31577218770980836, + "epoch": 1.1927144535840188, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9301170706748962, + "learning_rate": 3.9265810516113397e-07, + "loss": -0.0118, + "num_tokens": 136912540.0, + "reward": 0.66307293176651, + "reward_std": 0.1278460681438446, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.66307293176651, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3507560431957245, + "sampling/importance_sampling_ratio/max": 1.98316330909729, + "sampling/importance_sampling_ratio/mean": 0.999949038028717, + "sampling/importance_sampling_ratio/min": 0.42532923221588137, + "sampling/sampling_logp_difference/max": 0.8826199054718018, + "sampling/sampling_logp_difference/mean": 0.014387455582618714, + "step": 1015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1700.6, + "completions/max_terminated_length": 1700.6, + "completions/mean_length": 1082.975, + "completions/mean_terminated_length": 1082.975, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.3057791173458099, + "epoch": 1.198589894242068, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.6882150173187256, + "learning_rate": 3.920523382602374e-07, + "loss": 0.0127, + "num_tokens": 137576788.0, + "reward": 0.6527604460716248, + "reward_std": 0.15843599140644074, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6527604460716248, + "rewards/e2e_recall_precision_mixed_reward/std": 0.35246109366416933, + "sampling/importance_sampling_ratio/max": 1.9179187059402465, + "sampling/importance_sampling_ratio/mean": 1.0000202059745789, + "sampling/importance_sampling_ratio/min": 0.3111193537712097, + "sampling/sampling_logp_difference/max": 1.2015019178390502, + "sampling/sampling_logp_difference/mean": 0.014157050289213657, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1428.8, + "completions/max_terminated_length": 1428.8, + "completions/mean_length": 1032.625, + "completions/mean_terminated_length": 1032.625, + "completions/min_length": 639.8, + "completions/min_terminated_length": 639.8, + "entropy": 0.3173108518123627, + "epoch": 1.2044653349001175, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.774826169013977, + "learning_rate": 3.914465713593409e-07, + "loss": 0.0059, + "num_tokens": 138243644.0, + "reward": 0.7501562595367431, + "reward_std": 0.12034987509250641, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7501562595367431, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27434692680835726, + "sampling/importance_sampling_ratio/max": 1.977065873146057, + "sampling/importance_sampling_ratio/mean": 1.0000038266181945, + "sampling/importance_sampling_ratio/min": 0.40885123908519744, + "sampling/sampling_logp_difference/max": 1.1413819074630738, + "sampling/sampling_logp_difference/mean": 0.014192923717200757, + "step": 1025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1571.8, + "completions/max_terminated_length": 1571.8, + "completions/mean_length": 1090.91875, + "completions/mean_terminated_length": 1090.91875, + "completions/min_length": 744.8, + "completions/min_terminated_length": 744.8, + "entropy": 0.32138744592666624, + "epoch": 1.2103407755581668, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4858890771865845, + "learning_rate": 3.908408044584444e-07, + "loss": 0.0088, + "num_tokens": 138915474.0, + "reward": 0.7064583480358124, + "reward_std": 0.06657353341579438, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7064583480358124, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30555517971515656, + "sampling/importance_sampling_ratio/max": 1.9622070789337158, + "sampling/importance_sampling_ratio/mean": 1.0000203251838684, + "sampling/importance_sampling_ratio/min": 0.3662696361541748, + "sampling/sampling_logp_difference/max": 1.2498675346374513, + "sampling/sampling_logp_difference/mean": 0.014360511116683483, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.2, + "completions/max_terminated_length": 1514.2, + "completions/mean_length": 1031.846875, + "completions/mean_terminated_length": 1031.846875, + "completions/min_length": 668.6, + "completions/min_terminated_length": 668.6, + "entropy": 0.30652998089790345, + "epoch": 1.2162162162162162, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7699494957923889, + "learning_rate": 3.902350375575478e-07, + "loss": 0.009, + "num_tokens": 139571185.0, + "reward": 0.7100000023841858, + "reward_std": 0.0897395059466362, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7100000023841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3180919706821442, + "sampling/importance_sampling_ratio/max": 1.8632827520370483, + "sampling/importance_sampling_ratio/mean": 0.9999406576156616, + "sampling/importance_sampling_ratio/min": 0.3360584322363138, + "sampling/sampling_logp_difference/max": 1.4799968481063843, + "sampling/sampling_logp_difference/mean": 0.014310248382389545, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1635.2, + "completions/max_terminated_length": 1635.2, + "completions/mean_length": 1085.465625, + "completions/mean_terminated_length": 1085.465625, + "completions/min_length": 661.8, + "completions/min_terminated_length": 661.8, + "entropy": 0.31891674995422364, + "epoch": 1.2220916568742655, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8770401477813721, + "learning_rate": 3.896292706566513e-07, + "loss": -0.0065, + "num_tokens": 140219510.0, + "reward": 0.646302092075348, + "reward_std": 0.18240397274494172, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.646302092075348, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3216747730970383, + "sampling/importance_sampling_ratio/max": 1.9571067094802856, + "sampling/importance_sampling_ratio/mean": 1.0000643253326416, + "sampling/importance_sampling_ratio/min": 0.35792707204818724, + "sampling/sampling_logp_difference/max": 1.0642118215560914, + "sampling/sampling_logp_difference/mean": 0.014467264525592327, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.8, + "completions/max_terminated_length": 1492.8, + "completions/mean_length": 1060.609375, + "completions/mean_terminated_length": 1060.609375, + "completions/min_length": 768.6, + "completions/min_terminated_length": 768.6, + "entropy": 0.2915258467197418, + "epoch": 1.227967097532315, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.8293555378913879, + "learning_rate": 3.8902350375575474e-07, + "loss": 0.0039, + "num_tokens": 140889625.0, + "reward": 0.7548958539962769, + "reward_std": 0.12820187769830227, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7548958539962769, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26064820885658263, + "sampling/importance_sampling_ratio/max": 1.998341941833496, + "sampling/importance_sampling_ratio/mean": 0.9999308705329895, + "sampling/importance_sampling_ratio/min": 0.31071800738573074, + "sampling/sampling_logp_difference/max": 1.4954757213592529, + "sampling/sampling_logp_difference/mean": 0.013605404086411, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1621.8, + "completions/max_terminated_length": 1621.8, + "completions/mean_length": 971.846875, + "completions/mean_terminated_length": 971.846875, + "completions/min_length": 557.6, + "completions/min_terminated_length": 557.6, + "entropy": 0.29920910596847533, + "epoch": 1.2338425381903644, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6538206338882446, + "learning_rate": 3.8841773685485823e-07, + "loss": 0.0112, + "num_tokens": 141503960.0, + "reward": 0.6366666674613952, + "reward_std": 0.09315153658390045, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6366666793823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2838348388671875, + "sampling/importance_sampling_ratio/max": 1.9519197940826416, + "sampling/importance_sampling_ratio/mean": 1.000203275680542, + "sampling/importance_sampling_ratio/min": 0.3838867276906967, + "sampling/sampling_logp_difference/max": 1.0728678226470947, + "sampling/sampling_logp_difference/mean": 0.014343824982643128, + "step": 1050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1641.4, + "completions/max_terminated_length": 1641.4, + "completions/mean_length": 1104.896875, + "completions/mean_terminated_length": 1104.896875, + "completions/min_length": 646.0, + "completions/min_terminated_length": 646.0, + "entropy": 0.2859143793582916, + "epoch": 1.2397179788484136, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4819332957267761, + "learning_rate": 3.878119699539617e-07, + "loss": 0.0001, + "num_tokens": 142201559.0, + "reward": 0.6817708432674408, + "reward_std": 0.08722722977399826, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6817708432674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3009837418794632, + "sampling/importance_sampling_ratio/max": 1.929477906227112, + "sampling/importance_sampling_ratio/mean": 0.9999855399131775, + "sampling/importance_sampling_ratio/min": 0.3711371779441833, + "sampling/sampling_logp_difference/max": 1.088106060028076, + "sampling/sampling_logp_difference/mean": 0.01334429495036602, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1336.4, + "completions/max_terminated_length": 1336.4, + "completions/mean_length": 954.615625, + "completions/mean_terminated_length": 954.615625, + "completions/min_length": 709.0, + "completions/min_terminated_length": 709.0, + "entropy": 0.2719797283411026, + "epoch": 1.245593419506463, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7902669310569763, + "learning_rate": 3.8720620305306516e-07, + "loss": 0.0013, + "num_tokens": 142837324.0, + "reward": 0.7114583492279053, + "reward_std": 0.12315647304058075, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7114583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28056967854499815, + "sampling/importance_sampling_ratio/max": 1.9814757108688354, + "sampling/importance_sampling_ratio/mean": 1.000010859966278, + "sampling/importance_sampling_ratio/min": 0.33336481153965, + "sampling/sampling_logp_difference/max": 1.292347240447998, + "sampling/sampling_logp_difference/mean": 0.012967149168252945, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1552.4, + "completions/max_terminated_length": 1552.4, + "completions/mean_length": 1021.371875, + "completions/mean_terminated_length": 1021.371875, + "completions/min_length": 677.2, + "completions/min_terminated_length": 677.2, + "entropy": 0.2754159212112427, + "epoch": 1.2514688601645123, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4861776530742645, + "learning_rate": 3.8660043615216865e-07, + "loss": -0.0063, + "num_tokens": 143497939.0, + "reward": 0.7752604246139526, + "reward_std": 0.10692294090986251, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7752604246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2773577839136124, + "sampling/importance_sampling_ratio/max": 1.9510645627975465, + "sampling/importance_sampling_ratio/mean": 1.00001460313797, + "sampling/importance_sampling_ratio/min": 0.3078484356403351, + "sampling/sampling_logp_difference/max": 1.2806957244873047, + "sampling/sampling_logp_difference/mean": 0.013292433321475982, + "step": 1065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1440.6, + "completions/max_terminated_length": 1440.6, + "completions/mean_length": 998.9875, + "completions/mean_terminated_length": 998.9875, + "completions/min_length": 659.6, + "completions/min_terminated_length": 659.6, + "entropy": 0.2834341287612915, + "epoch": 1.2573443008225618, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6322982907295227, + "learning_rate": 3.859946692512721e-07, + "loss": 0.0005, + "num_tokens": 144123663.0, + "reward": 0.8090625166893005, + "reward_std": 0.11898693442344666, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8090625166893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23603131622076035, + "sampling/importance_sampling_ratio/max": 1.8387038946151733, + "sampling/importance_sampling_ratio/mean": 1.0000056982040406, + "sampling/importance_sampling_ratio/min": 0.3637888193130493, + "sampling/sampling_logp_difference/max": 1.0442706823349, + "sampling/sampling_logp_difference/mean": 0.013553647883236408, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1840.4, + "completions/max_terminated_length": 1650.4, + "completions/mean_length": 1059.684375, + "completions/mean_terminated_length": 1055.2914306640625, + "completions/min_length": 690.4, + "completions/min_terminated_length": 690.4, + "entropy": 0.29739258885383607, + "epoch": 1.263219741480611, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.9271990656852722, + "learning_rate": 3.853889023503756e-07, + "loss": -0.0163, + "num_tokens": 144791014.0, + "reward": 0.6409895896911622, + "reward_std": 0.13399946838617324, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6409896016120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3103723287582397, + "sampling/importance_sampling_ratio/max": 1.9143768310546876, + "sampling/importance_sampling_ratio/mean": 1.000032413005829, + "sampling/importance_sampling_ratio/min": 0.37732569575309755, + "sampling/sampling_logp_difference/max": 1.162096655368805, + "sampling/sampling_logp_difference/mean": 0.013873641937971115, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1678.8, + "completions/max_terminated_length": 1678.8, + "completions/mean_length": 1075.20625, + "completions/mean_terminated_length": 1075.20625, + "completions/min_length": 751.6, + "completions/min_terminated_length": 751.6, + "entropy": 0.28603232502937315, + "epoch": 1.2690951821386605, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.3865974247455597, + "learning_rate": 3.8478313544947906e-07, + "loss": -0.0016, + "num_tokens": 145442248.0, + "reward": 0.716979193687439, + "reward_std": 0.1147857926785946, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.716979193687439, + "rewards/e2e_recall_precision_mixed_reward/std": 0.35159227848052976, + "sampling/importance_sampling_ratio/max": 1.8565044403076172, + "sampling/importance_sampling_ratio/mean": 0.9999043822288514, + "sampling/importance_sampling_ratio/min": 0.2803425773978233, + "sampling/sampling_logp_difference/max": 1.4406961679458619, + "sampling/sampling_logp_difference/mean": 0.013564172014594079, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1652.2, + "completions/max_terminated_length": 1652.2, + "completions/mean_length": 1100.740625, + "completions/mean_terminated_length": 1100.740625, + "completions/min_length": 779.6, + "completions/min_terminated_length": 779.6, + "entropy": 0.2707725286483765, + "epoch": 1.2749706227967097, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7248564958572388, + "learning_rate": 3.841773685485825e-07, + "loss": 0.0048, + "num_tokens": 146117157.0, + "reward": 0.7029687643051148, + "reward_std": 0.10289829894900322, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7029687643051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30764630138874055, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 0.19010883904993534, + "sampling/sampling_logp_difference/max": 1.9769764423370362, + "sampling/sampling_logp_difference/mean": 0.013046731427311897, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1497.0, + "completions/max_terminated_length": 1497.0, + "completions/mean_length": 1038.95, + "completions/mean_terminated_length": 1038.95, + "completions/min_length": 711.4, + "completions/min_terminated_length": 711.4, + "entropy": 0.2841998040676117, + "epoch": 1.2808460634547592, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4237470030784607, + "learning_rate": 3.8357160164768594e-07, + "loss": 0.0057, + "num_tokens": 146740245.0, + "reward": 0.784166669845581, + "reward_std": 0.10403337031602859, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.784166669845581, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27793429493904115, + "sampling/importance_sampling_ratio/max": 1.942676877975464, + "sampling/importance_sampling_ratio/mean": 0.9999974012374878, + "sampling/importance_sampling_ratio/min": 0.368558007478714, + "sampling/sampling_logp_difference/max": 1.1511848449707032, + "sampling/sampling_logp_difference/mean": 0.013521903567016125, + "step": 1090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1358.6, + "completions/max_terminated_length": 1358.6, + "completions/mean_length": 1007.3875, + "completions/mean_terminated_length": 1007.3875, + "completions/min_length": 683.4, + "completions/min_terminated_length": 683.4, + "entropy": 0.2815568208694458, + "epoch": 1.2867215041128084, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7796571254730225, + "learning_rate": 3.829658347467894e-07, + "loss": 0.003, + "num_tokens": 147365153.0, + "reward": 0.7693750143051148, + "reward_std": 0.11561888605356216, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7693750143051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2604265660047531, + "sampling/importance_sampling_ratio/max": 1.8576125860214234, + "sampling/importance_sampling_ratio/mean": 0.9999807476997375, + "sampling/importance_sampling_ratio/min": 0.3188953049480915, + "sampling/sampling_logp_difference/max": 1.65875186920166, + "sampling/sampling_logp_difference/mean": 0.013848881609737873, + "step": 1095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1706.8, + "completions/max_terminated_length": 1706.8, + "completions/mean_length": 1156.165625, + "completions/mean_terminated_length": 1156.165625, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.29959659576416015, + "epoch": 1.2925969447708578, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7763993740081787, + "learning_rate": 3.8236006784589286e-07, + "loss": 0.0163, + "num_tokens": 148050518.0, + "reward": 0.6523437619209289, + "reward_std": 0.08477627635002136, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6523437619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3252843528985977, + "sampling/importance_sampling_ratio/max": 1.8790109157562256, + "sampling/importance_sampling_ratio/mean": 0.9999762296676635, + "sampling/importance_sampling_ratio/min": 0.27901336550712585, + "sampling/sampling_logp_difference/max": 1.3847559690475464, + "sampling/sampling_logp_difference/mean": 0.01403086856007576, + "step": 1100 + }, + { + "epoch": 1.2925969447708578, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1490.96, + "eval_completions/max_terminated_length": 1490.96, + "eval_completions/mean_length": 990.515, + "eval_completions/mean_terminated_length": 990.515, + "eval_completions/min_length": 692.08, + "eval_completions/min_terminated_length": 692.08, + "eval_entropy": 0.27925810635089876, + "eval_frac_reward_zero_std": 0.47, + "eval_loss": 0.005843394435942173, + "eval_num_tokens": 148050518.0, + "eval_reward": 0.6923645973205567, + "eval_reward_std": 0.10961450830101967, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6923645985126495, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3154654276371002, + "eval_runtime": 394.8883, + "eval_samples_per_second": 0.253, + "eval_sampling/importance_sampling_ratio/max": 1.9187990188598634, + "eval_sampling/importance_sampling_ratio/mean": 0.9999549126625061, + "eval_sampling/importance_sampling_ratio/min": 0.33063108295202254, + "eval_sampling/sampling_logp_difference/max": 1.217089729309082, + "eval_sampling/sampling_logp_difference/mean": 0.013591401651501656, + "eval_steps_per_second": 0.005, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1543.4, + "completions/max_terminated_length": 1524.4, + "completions/mean_length": 1014.178125, + "completions/mean_terminated_length": 995.0234619140625, + "completions/min_length": 694.2, + "completions/min_terminated_length": 694.2, + "entropy": 0.26261157989501954, + "epoch": 1.2984723854289073, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.5455294847488403, + "learning_rate": 3.8175430094499635e-07, + "loss": -0.0131, + "num_tokens": 148691003.0, + "reward": 0.6966146051883697, + "reward_std": 0.15783809274435043, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6966146051883697, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34295274019241334, + "sampling/importance_sampling_ratio/max": 1.9716975450515748, + "sampling/importance_sampling_ratio/mean": 1.00016188621521, + "sampling/importance_sampling_ratio/min": 0.3574655741453171, + "sampling/sampling_logp_difference/max": 1.1117752075195313, + "sampling/sampling_logp_difference/mean": 0.013009194284677505, + "step": 1105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1565.6, + "completions/max_terminated_length": 1565.6, + "completions/mean_length": 1094.09375, + "completions/mean_terminated_length": 1094.09375, + "completions/min_length": 731.4, + "completions/min_terminated_length": 731.4, + "entropy": 0.2692096889019012, + "epoch": 1.3043478260869565, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.0, + "learning_rate": 3.811485340440998e-07, + "loss": 0.0113, + "num_tokens": 149393113.0, + "reward": 0.6494791746139527, + "reward_std": 0.1348447620868683, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6494791746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3158786088228226, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000579476356506, + "sampling/importance_sampling_ratio/min": 0.29540793895721434, + "sampling/sampling_logp_difference/max": 1.2994348287582398, + "sampling/sampling_logp_difference/mean": 0.013329188153147698, + "step": 1110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1392.6, + "completions/max_terminated_length": 1392.6, + "completions/mean_length": 1017.58125, + "completions/mean_terminated_length": 1017.58125, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "entropy": 0.2770149827003479, + "epoch": 1.3102232667450058, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7833777070045471, + "learning_rate": 3.805427671432033e-07, + "loss": 0.0032, + "num_tokens": 150030707.0, + "reward": 0.6882812678813934, + "reward_std": 0.11308581605553628, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6882812678813934, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29742818176746366, + "sampling/importance_sampling_ratio/max": 1.935225486755371, + "sampling/importance_sampling_ratio/mean": 1.000083565711975, + "sampling/importance_sampling_ratio/min": 0.45306941866874695, + "sampling/sampling_logp_difference/max": 0.8295891046524048, + "sampling/sampling_logp_difference/mean": 0.013369818776845932, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1730.0, + "completions/max_terminated_length": 1730.0, + "completions/mean_length": 1053.4125, + "completions/mean_terminated_length": 1053.4125, + "completions/min_length": 710.2, + "completions/min_terminated_length": 710.2, + "entropy": 0.2534839272499084, + "epoch": 1.3160987074030552, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.5904003977775574, + "learning_rate": 3.799370002423067e-07, + "loss": 0.0026, + "num_tokens": 150675895.0, + "reward": 0.6664583504199981, + "reward_std": 0.11373435258865357, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6664583504199981, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2492772251367569, + "sampling/importance_sampling_ratio/max": 1.8575801372528076, + "sampling/importance_sampling_ratio/mean": 0.9999268770217895, + "sampling/importance_sampling_ratio/min": 0.385788106918335, + "sampling/sampling_logp_difference/max": 1.204633069038391, + "sampling/sampling_logp_difference/mean": 0.012645184434950352, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1442.8, + "completions/max_terminated_length": 1442.8, + "completions/mean_length": 1003.15, + "completions/mean_terminated_length": 1003.15, + "completions/min_length": 749.6, + "completions/min_terminated_length": 749.6, + "entropy": 0.2703131794929504, + "epoch": 1.3219741480611047, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7168449759483337, + "learning_rate": 3.793312333414102e-07, + "loss": -0.0106, + "num_tokens": 151362823.0, + "reward": 0.6812500119209289, + "reward_std": 0.1785949647426605, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6812500119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29773281812667846, + "sampling/importance_sampling_ratio/max": 1.8423972368240356, + "sampling/importance_sampling_ratio/mean": 0.9999767899513244, + "sampling/importance_sampling_ratio/min": 0.2740974217653275, + "sampling/sampling_logp_difference/max": 1.5454424619674683, + "sampling/sampling_logp_difference/mean": 0.01345563717186451, + "step": 1125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1378.8, + "completions/max_terminated_length": 1378.8, + "completions/mean_length": 955.834375, + "completions/mean_terminated_length": 955.834375, + "completions/min_length": 688.2, + "completions/min_terminated_length": 688.2, + "entropy": 0.26716753244400027, + "epoch": 1.327849588719154, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6475690007209778, + "learning_rate": 3.787254664405137e-07, + "loss": 0.0029, + "num_tokens": 151966018.0, + "reward": 0.733593761920929, + "reward_std": 0.13283937126398088, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.733593761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2886820375919342, + "sampling/importance_sampling_ratio/max": 1.9504351377487184, + "sampling/importance_sampling_ratio/mean": 0.9999610781669617, + "sampling/importance_sampling_ratio/min": 0.2764181695878506, + "sampling/sampling_logp_difference/max": 1.8616169929504394, + "sampling/sampling_logp_difference/mean": 0.013580117933452129, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.8, + "completions/max_terminated_length": 1355.8, + "completions/mean_length": 989.5875, + "completions/mean_terminated_length": 989.5875, + "completions/min_length": 734.8, + "completions/min_terminated_length": 734.8, + "entropy": 0.2569707274436951, + "epoch": 1.3337250293772032, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.862073540687561, + "learning_rate": 3.7811969953961713e-07, + "loss": -0.0004, + "num_tokens": 152581582.0, + "reward": 0.6937500238418579, + "reward_std": 0.1769823968410492, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.693750011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30896676778793336, + "sampling/importance_sampling_ratio/max": 1.9276174783706665, + "sampling/importance_sampling_ratio/mean": 1.0000139355659485, + "sampling/importance_sampling_ratio/min": 0.46765230894088744, + "sampling/sampling_logp_difference/max": 1.0491119384765626, + "sampling/sampling_logp_difference/mean": 0.013130680657923222, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1493.6, + "completions/max_terminated_length": 1493.6, + "completions/mean_length": 1019.05625, + "completions/mean_terminated_length": 1019.05625, + "completions/min_length": 723.0, + "completions/min_terminated_length": 723.0, + "entropy": 0.27715269327163694, + "epoch": 1.3396004700352526, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7062737345695496, + "learning_rate": 3.775139326387206e-07, + "loss": -0.001, + "num_tokens": 153221584.0, + "reward": 0.6836979329586029, + "reward_std": 0.11981369033455849, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6836979329586029, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2908316344022751, + "sampling/importance_sampling_ratio/max": 1.9820157051086427, + "sampling/importance_sampling_ratio/mean": 1.0000499367713929, + "sampling/importance_sampling_ratio/min": 0.2672798324376345, + "sampling/sampling_logp_difference/max": 1.7221438407897949, + "sampling/sampling_logp_difference/mean": 0.013661108911037445, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1410.2, + "completions/max_terminated_length": 1410.2, + "completions/mean_length": 1047.36875, + "completions/mean_terminated_length": 1047.36875, + "completions/min_length": 694.4, + "completions/min_terminated_length": 694.4, + "entropy": 0.28525510430336, + "epoch": 1.345475910693302, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.768993079662323, + "learning_rate": 3.7690816573782406e-07, + "loss": 0.0026, + "num_tokens": 153870806.0, + "reward": 0.64041668176651, + "reward_std": 0.11628761440515518, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.64041668176651, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2930860996246338, + "sampling/importance_sampling_ratio/max": 1.9370128154754638, + "sampling/importance_sampling_ratio/mean": 0.999804961681366, + "sampling/importance_sampling_ratio/min": 0.31624155938625337, + "sampling/sampling_logp_difference/max": 1.2107308864593507, + "sampling/sampling_logp_difference/mean": 0.013913381099700927, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1712.8, + "completions/max_terminated_length": 1712.8, + "completions/mean_length": 1118.059375, + "completions/mean_terminated_length": 1118.059375, + "completions/min_length": 778.2, + "completions/min_terminated_length": 778.2, + "entropy": 0.2776745676994324, + "epoch": 1.3513513513513513, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7642037272453308, + "learning_rate": 3.7630239883692754e-07, + "loss": -0.0087, + "num_tokens": 154553769.0, + "reward": 0.7213541746139527, + "reward_std": 0.10083994418382644, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7213541746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31691438853740694, + "sampling/importance_sampling_ratio/max": 1.981196355819702, + "sampling/importance_sampling_ratio/mean": 1.0001033902168275, + "sampling/importance_sampling_ratio/min": 0.2930480852723122, + "sampling/sampling_logp_difference/max": 1.4658790826797485, + "sampling/sampling_logp_difference/mean": 0.01348379347473383, + "step": 1150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1759.0, + "completions/max_terminated_length": 1759.0, + "completions/mean_length": 1095.1125, + "completions/mean_terminated_length": 1095.1125, + "completions/min_length": 747.2, + "completions/min_terminated_length": 747.2, + "entropy": 0.2642821192741394, + "epoch": 1.3572267920094008, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6376672983169556, + "learning_rate": 3.7569663193603103e-07, + "loss": 0.0022, + "num_tokens": 155225645.0, + "reward": 0.7658854246139526, + "reward_std": 0.10820303261280059, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7658854246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2597797363996506, + "sampling/importance_sampling_ratio/max": 1.948975110054016, + "sampling/importance_sampling_ratio/mean": 1.000026035308838, + "sampling/importance_sampling_ratio/min": 0.35516688525676726, + "sampling/sampling_logp_difference/max": 1.1820269107818604, + "sampling/sampling_logp_difference/mean": 0.013134175911545753, + "step": 1155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1399.0, + "completions/max_terminated_length": 1399.0, + "completions/mean_length": 1009.390625, + "completions/mean_terminated_length": 1009.390625, + "completions/min_length": 767.4, + "completions/min_terminated_length": 767.4, + "entropy": 0.2690662145614624, + "epoch": 1.36310223266745, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.7229093909263611, + "learning_rate": 3.7509086503513447e-07, + "loss": 0.0111, + "num_tokens": 155873482.0, + "reward": 0.7645833373069764, + "reward_std": 0.1397414982318878, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7645833373069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2538767337799072, + "sampling/importance_sampling_ratio/max": 1.9541364669799806, + "sampling/importance_sampling_ratio/mean": 1.0000741958618165, + "sampling/importance_sampling_ratio/min": 0.3407637387514114, + "sampling/sampling_logp_difference/max": 1.1103405952453613, + "sampling/sampling_logp_difference/mean": 0.013483352214097976, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1418.8, + "completions/max_terminated_length": 1418.8, + "completions/mean_length": 1008.209375, + "completions/mean_terminated_length": 1008.209375, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 0.2757707953453064, + "epoch": 1.3689776733254995, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7225468754768372, + "learning_rate": 3.7448509813423796e-07, + "loss": 0.0025, + "num_tokens": 156518173.0, + "reward": 0.6832291841506958, + "reward_std": 0.13243707567453383, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6832291841506958, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34315310418605804, + "sampling/importance_sampling_ratio/max": 1.9023520231246949, + "sampling/importance_sampling_ratio/mean": 1.0001229047775269, + "sampling/importance_sampling_ratio/min": 0.3911436438560486, + "sampling/sampling_logp_difference/max": 1.1172781705856323, + "sampling/sampling_logp_difference/mean": 0.013606655411422253, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1205.0, + "completions/max_terminated_length": 1205.0, + "completions/mean_length": 889.803125, + "completions/mean_terminated_length": 889.803125, + "completions/min_length": 617.6, + "completions/min_terminated_length": 617.6, + "entropy": 0.2621892154216766, + "epoch": 1.3748531139835487, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.5152686834335327, + "learning_rate": 3.7387933123334134e-07, + "loss": -0.0004, + "num_tokens": 157141166.0, + "reward": 0.7700520992279053, + "reward_std": 0.12003648579120636, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7700521111488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3090189278125763, + "sampling/importance_sampling_ratio/max": 1.9857268810272217, + "sampling/importance_sampling_ratio/mean": 1.0000556349754333, + "sampling/importance_sampling_ratio/min": 0.3500809669494629, + "sampling/sampling_logp_difference/max": 1.161958146095276, + "sampling/sampling_logp_difference/mean": 0.013260713964700698, + "step": 1170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1336.8, + "completions/max_terminated_length": 1336.8, + "completions/mean_length": 970.371875, + "completions/mean_terminated_length": 970.371875, + "completions/min_length": 635.2, + "completions/min_terminated_length": 635.2, + "entropy": 0.2712902396917343, + "epoch": 1.3807285546415982, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7717751860618591, + "learning_rate": 3.7327356433244483e-07, + "loss": 0.0001, + "num_tokens": 157777893.0, + "reward": 0.7880208492279053, + "reward_std": 0.1345105454325676, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7880208492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23644578456878662, + "sampling/importance_sampling_ratio/max": 1.9879061222076415, + "sampling/importance_sampling_ratio/mean": 1.0000129222869873, + "sampling/importance_sampling_ratio/min": 0.4100755751132965, + "sampling/sampling_logp_difference/max": 1.0043761134147644, + "sampling/sampling_logp_difference/mean": 0.013120016269385814, + "step": 1175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1377.4, + "completions/max_terminated_length": 1377.4, + "completions/mean_length": 953.33125, + "completions/mean_terminated_length": 953.33125, + "completions/min_length": 671.4, + "completions/min_terminated_length": 671.4, + "entropy": 0.2518512338399887, + "epoch": 1.3866039952996474, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.708027184009552, + "learning_rate": 3.726677974315483e-07, + "loss": 0.0097, + "num_tokens": 158411423.0, + "reward": 0.7781770944595336, + "reward_std": 0.10090606659650803, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7781770944595336, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2655190169811249, + "sampling/importance_sampling_ratio/max": 1.914062237739563, + "sampling/importance_sampling_ratio/mean": 0.9999913334846496, + "sampling/importance_sampling_ratio/min": 0.4159839451313019, + "sampling/sampling_logp_difference/max": 0.9063684225082398, + "sampling/sampling_logp_difference/mean": 0.012738440930843354, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1215.2, + "completions/max_terminated_length": 1215.2, + "completions/mean_length": 938.703125, + "completions/mean_terminated_length": 938.703125, + "completions/min_length": 698.2, + "completions/min_terminated_length": 698.2, + "entropy": 0.2818134605884552, + "epoch": 1.3924794359576969, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6890040040016174, + "learning_rate": 3.7206203053065176e-07, + "loss": 0.0017, + "num_tokens": 159030960.0, + "reward": 0.675677090883255, + "reward_std": 0.12252334356307984, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.675677090883255, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30053952932357786, + "sampling/importance_sampling_ratio/max": 1.8457768201828002, + "sampling/importance_sampling_ratio/mean": 1.0000203847885132, + "sampling/importance_sampling_ratio/min": 0.5052196741104126, + "sampling/sampling_logp_difference/max": 0.7195221900939941, + "sampling/sampling_logp_difference/mean": 0.013677260465919971, + "step": 1185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1416.8, + "completions/max_terminated_length": 1416.8, + "completions/mean_length": 1010.390625, + "completions/mean_terminated_length": 1010.390625, + "completions/min_length": 713.4, + "completions/min_terminated_length": 713.4, + "entropy": 0.2787392377853394, + "epoch": 1.398354876615746, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.38302043080329895, + "learning_rate": 3.7145626362975525e-07, + "loss": 0.0009, + "num_tokens": 159661149.0, + "reward": 0.682812511920929, + "reward_std": 0.09650907553732395, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.682812511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2945326387882233, + "sampling/importance_sampling_ratio/max": 1.927133321762085, + "sampling/importance_sampling_ratio/mean": 0.9999685287475586, + "sampling/importance_sampling_ratio/min": 0.33883661329746245, + "sampling/sampling_logp_difference/max": 1.2834124326705934, + "sampling/sampling_logp_difference/mean": 0.01364175509661436, + "step": 1190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1491.2, + "completions/max_terminated_length": 1491.2, + "completions/mean_length": 1039.9375, + "completions/mean_terminated_length": 1039.9375, + "completions/min_length": 756.6, + "completions/min_terminated_length": 756.6, + "entropy": 0.2633058696985245, + "epoch": 1.4042303172737955, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6820769906044006, + "learning_rate": 3.708504967288587e-07, + "loss": 0.0059, + "num_tokens": 160330617.0, + "reward": 0.789843761920929, + "reward_std": 0.12763621509075165, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.789843761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26849266290664675, + "sampling/importance_sampling_ratio/max": 1.9752416849136352, + "sampling/importance_sampling_ratio/mean": 1.0000136494636536, + "sampling/importance_sampling_ratio/min": 0.40743643045425415, + "sampling/sampling_logp_difference/max": 1.0446730375289917, + "sampling/sampling_logp_difference/mean": 0.012942253239452839, + "step": 1195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1338.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 932.365625, + "completions/mean_terminated_length": 932.365625, + "completions/min_length": 648.6, + "completions/min_terminated_length": 648.6, + "entropy": 0.2665301501750946, + "epoch": 1.410105757931845, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6627605557441711, + "learning_rate": 3.7024472982796217e-07, + "loss": 0.0013, + "num_tokens": 160940990.0, + "reward": 0.6481250047683715, + "reward_std": 0.10753663703799247, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6481250047683715, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36586096286773684, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000364661216736, + "sampling/importance_sampling_ratio/min": 0.38219852447509767, + "sampling/sampling_logp_difference/max": 1.015994167327881, + "sampling/sampling_logp_difference/mean": 0.013067251071333885, + "step": 1200 + }, + { + "epoch": 1.410105757931845, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1236.44, + "eval_completions/max_terminated_length": 1236.44, + "eval_completions/mean_length": 903.0825, + "eval_completions/mean_terminated_length": 903.0825, + "eval_completions/min_length": 680.32, + "eval_completions/min_terminated_length": 680.32, + "eval_entropy": 0.2593661844730377, + "eval_frac_reward_zero_std": 0.48, + "eval_loss": 0.0016279831761494279, + "eval_num_tokens": 160940990.0, + "eval_reward": 0.6931771004199981, + "eval_reward_std": 0.10766801729798317, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.6931771016120911, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31816164910793304, + "eval_runtime": 334.8567, + "eval_samples_per_second": 0.299, + "eval_sampling/importance_sampling_ratio/max": 1.915867462158203, + "eval_sampling/importance_sampling_ratio/mean": 0.9999779844284058, + "eval_sampling/importance_sampling_ratio/min": 0.39130869776010513, + "eval_sampling/sampling_logp_difference/max": 1.063571047782898, + "eval_sampling/sampling_logp_difference/mean": 0.012970775477588176, + "eval_steps_per_second": 0.006, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1300.2, + "completions/max_terminated_length": 1300.2, + "completions/mean_length": 934.4875, + "completions/mean_terminated_length": 934.4875, + "completions/min_length": 667.2, + "completions/min_terminated_length": 667.2, + "entropy": 0.2741489470005035, + "epoch": 1.4159811985898942, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8523896336555481, + "learning_rate": 3.6963896292706566e-07, + "loss": 0.0045, + "num_tokens": 161607738.0, + "reward": 0.6617187678813934, + "reward_std": 0.11894842982292175, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6617187798023224, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32751025855541227, + "sampling/importance_sampling_ratio/max": 1.9774553775787354, + "sampling/importance_sampling_ratio/mean": 1.0000733613967896, + "sampling/importance_sampling_ratio/min": 0.2799623891711235, + "sampling/sampling_logp_difference/max": 1.5475644588470459, + "sampling/sampling_logp_difference/mean": 0.013712556660175323, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 975.809375, + "completions/mean_terminated_length": 975.809375, + "completions/min_length": 724.2, + "completions/min_terminated_length": 724.2, + "entropy": 0.26065073907375336, + "epoch": 1.4218566392479435, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.6366615891456604, + "learning_rate": 3.690331960261691e-07, + "loss": -0.0025, + "num_tokens": 162238429.0, + "reward": 0.6471354246139527, + "reward_std": 0.1409156620502472, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6471354365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2605146482586861, + "sampling/importance_sampling_ratio/max": 1.9828581094741822, + "sampling/importance_sampling_ratio/mean": 0.9999633073806763, + "sampling/importance_sampling_ratio/min": 0.3066392242908478, + "sampling/sampling_logp_difference/max": 1.2085018634796143, + "sampling/sampling_logp_difference/mean": 0.013046330399811268, + "step": 1210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1418.6, + "completions/max_terminated_length": 1418.6, + "completions/mean_length": 965.3125, + "completions/mean_terminated_length": 965.3125, + "completions/min_length": 718.8, + "completions/min_terminated_length": 718.8, + "entropy": 0.2614882171154022, + "epoch": 1.427732079905993, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6511854529380798, + "learning_rate": 3.684274291252726e-07, + "loss": 0.0002, + "num_tokens": 162839185.0, + "reward": 0.7278646111488343, + "reward_std": 0.08626341633498669, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7278646111488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3056270956993103, + "sampling/importance_sampling_ratio/max": 1.926198172569275, + "sampling/importance_sampling_ratio/mean": 0.9999306440353394, + "sampling/importance_sampling_ratio/min": 0.3832464128732681, + "sampling/sampling_logp_difference/max": 1.1059164881706238, + "sampling/sampling_logp_difference/mean": 0.012989461980760097, + "step": 1215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1518.6, + "completions/max_terminated_length": 1325.4, + "completions/mean_length": 996.95, + "completions/mean_terminated_length": 992.1390991210938, + "completions/min_length": 739.6, + "completions/min_terminated_length": 739.6, + "entropy": 0.2724646270275116, + "epoch": 1.4336075205640424, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3615824580192566, + "learning_rate": 3.67821662224376e-07, + "loss": -0.008, + "num_tokens": 163491181.0, + "reward": 0.73125, + "reward_std": 0.09822167605161666, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.73125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2837597757577896, + "sampling/importance_sampling_ratio/max": 1.8505266666412354, + "sampling/importance_sampling_ratio/mean": 0.9999452590942383, + "sampling/importance_sampling_ratio/min": 0.4435959100723267, + "sampling/sampling_logp_difference/max": 1.5051439166069032, + "sampling/sampling_logp_difference/mean": 0.013260528817772866, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1521.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 1007.390625, + "completions/mean_terminated_length": 1007.390625, + "completions/min_length": 767.8, + "completions/min_terminated_length": 767.8, + "entropy": 0.26534418761730194, + "epoch": 1.4394829612220916, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 3.672158953234795e-07, + "loss": 0.0022, + "num_tokens": 164151914.0, + "reward": 0.7981250166893006, + "reward_std": 0.1127402737736702, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7981250166893006, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2560457527637482, + "sampling/importance_sampling_ratio/max": 1.9135950803756714, + "sampling/importance_sampling_ratio/mean": 0.999986755847931, + "sampling/importance_sampling_ratio/min": 0.303699953854084, + "sampling/sampling_logp_difference/max": 1.3842716932296752, + "sampling/sampling_logp_difference/mean": 0.013141075521707535, + "step": 1225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1462.2, + "completions/max_terminated_length": 1462.2, + "completions/mean_length": 1018.825, + "completions/mean_terminated_length": 1018.825, + "completions/min_length": 698.2, + "completions/min_terminated_length": 698.2, + "entropy": 0.2592238187789917, + "epoch": 1.445358401880141, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.595700740814209, + "learning_rate": 3.66610128422583e-07, + "loss": -0.0046, + "num_tokens": 164830738.0, + "reward": 0.6343229413032532, + "reward_std": 0.10299613662064075, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6343229413032532, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3512304097414017, + "sampling/importance_sampling_ratio/max": 1.9878422021865845, + "sampling/importance_sampling_ratio/mean": 0.9999564051628113, + "sampling/importance_sampling_ratio/min": 0.35334097146987914, + "sampling/sampling_logp_difference/max": 1.1131124019622802, + "sampling/sampling_logp_difference/mean": 0.012872187793254853, + "step": 1230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1467.4, + "completions/max_terminated_length": 1467.4, + "completions/mean_length": 993.9, + "completions/mean_terminated_length": 993.9, + "completions/min_length": 724.6, + "completions/min_terminated_length": 724.6, + "entropy": 0.2649468719959259, + "epoch": 1.4512338425381903, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6314127445220947, + "learning_rate": 3.6600436152168644e-07, + "loss": -0.0026, + "num_tokens": 165447458.0, + "reward": 0.7947916865348816, + "reward_std": 0.1078619197010994, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7947916865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19745177924633026, + "sampling/importance_sampling_ratio/max": 1.977701473236084, + "sampling/importance_sampling_ratio/mean": 0.9999303340911865, + "sampling/importance_sampling_ratio/min": 0.2354953714646399, + "sampling/sampling_logp_difference/max": 2.2229801654815673, + "sampling/sampling_logp_difference/mean": 0.012961567752063275, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1326.6, + "completions/max_terminated_length": 1326.6, + "completions/mean_length": 996.71875, + "completions/mean_terminated_length": 996.71875, + "completions/min_length": 760.4, + "completions/min_terminated_length": 760.4, + "entropy": 0.2625953197479248, + "epoch": 1.4571092831962398, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.41131144762039185, + "learning_rate": 3.6539859462078993e-07, + "loss": 0.001, + "num_tokens": 166096216.0, + "reward": 0.7678125143051148, + "reward_std": 0.09220594316720962, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7678125143051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23345414102077483, + "sampling/importance_sampling_ratio/max": 1.895510697364807, + "sampling/importance_sampling_ratio/mean": 0.999940812587738, + "sampling/importance_sampling_ratio/min": 0.4072090119123459, + "sampling/sampling_logp_difference/max": 0.9523330926895142, + "sampling/sampling_logp_difference/mean": 0.01279841959476471, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1585.8, + "completions/max_terminated_length": 1548.8, + "completions/mean_length": 1040.771875, + "completions/mean_terminated_length": 1036.761669921875, + "completions/min_length": 738.6, + "completions/min_terminated_length": 738.6, + "entropy": 0.25499052703380587, + "epoch": 1.462984723854289, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5272420048713684, + "learning_rate": 3.647928277198934e-07, + "loss": 0.0015, + "num_tokens": 166732619.0, + "reward": 0.7338541924953461, + "reward_std": 0.1370665103197098, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7338541924953461, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2300163447856903, + "sampling/importance_sampling_ratio/max": 1.9345684766769409, + "sampling/importance_sampling_ratio/mean": 1.0000292301177978, + "sampling/importance_sampling_ratio/min": 0.2501420438289642, + "sampling/sampling_logp_difference/max": 1.5750776767730712, + "sampling/sampling_logp_difference/mean": 0.01261440571397543, + "step": 1245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1672.4, + "completions/max_terminated_length": 1648.4, + "completions/mean_length": 1038.628125, + "completions/mean_terminated_length": 1026.3600830078126, + "completions/min_length": 766.8, + "completions/min_terminated_length": 766.8, + "entropy": 0.2597430557012558, + "epoch": 1.4688601645123385, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6333122253417969, + "learning_rate": 3.6418706081899685e-07, + "loss": -0.0051, + "num_tokens": 167379848.0, + "reward": 0.7830729246139526, + "reward_std": 0.09931705892086029, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7830729246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27932691723108294, + "sampling/importance_sampling_ratio/max": 1.9465564727783202, + "sampling/importance_sampling_ratio/mean": 1.0000145792961121, + "sampling/importance_sampling_ratio/min": 0.33197267055511476, + "sampling/sampling_logp_difference/max": 1.224328637123108, + "sampling/sampling_logp_difference/mean": 0.012780552357435226, + "step": 1250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1333.4, + "completions/max_terminated_length": 1333.4, + "completions/mean_length": 967.034375, + "completions/mean_terminated_length": 967.034375, + "completions/min_length": 714.6, + "completions/min_terminated_length": 714.6, + "entropy": 0.26842235326766967, + "epoch": 1.4747356051703877, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6638708710670471, + "learning_rate": 3.635812939181003e-07, + "loss": 0.0041, + "num_tokens": 168031283.0, + "reward": 0.784583330154419, + "reward_std": 0.07790126055479049, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.784583330154419, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27794828414916994, + "sampling/importance_sampling_ratio/max": 1.9674632549285889, + "sampling/importance_sampling_ratio/mean": 0.9999781250953674, + "sampling/importance_sampling_ratio/min": 0.32500605285167694, + "sampling/sampling_logp_difference/max": 1.4220096588134765, + "sampling/sampling_logp_difference/mean": 0.013222084194421769, + "step": 1255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1456.6, + "completions/max_terminated_length": 1456.6, + "completions/mean_length": 1003.51875, + "completions/mean_terminated_length": 1003.51875, + "completions/min_length": 712.6, + "completions/min_terminated_length": 712.6, + "entropy": 0.2660016596317291, + "epoch": 1.4806110458284372, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.707177996635437, + "learning_rate": 3.6297552701720373e-07, + "loss": -0.0025, + "num_tokens": 168686857.0, + "reward": 0.7382812619209289, + "reward_std": 0.13373910933732985, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7382812619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30144949853420255, + "sampling/importance_sampling_ratio/max": 1.9266873836517333, + "sampling/importance_sampling_ratio/mean": 1.0000134468078614, + "sampling/importance_sampling_ratio/min": 0.26473745703697205, + "sampling/sampling_logp_difference/max": 1.5025355339050293, + "sampling/sampling_logp_difference/mean": 0.013266277499496937, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1320.4, + "completions/max_terminated_length": 1320.4, + "completions/mean_length": 961.946875, + "completions/mean_terminated_length": 961.946875, + "completions/min_length": 756.8, + "completions/min_terminated_length": 756.8, + "entropy": 0.27312275767326355, + "epoch": 1.4864864864864864, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7554060816764832, + "learning_rate": 3.623697601163072e-07, + "loss": 0.0003, + "num_tokens": 169318056.0, + "reward": 0.8033854246139527, + "reward_std": 0.10426819771528244, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8033854246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2865228235721588, + "sampling/importance_sampling_ratio/max": 1.8720565795898438, + "sampling/importance_sampling_ratio/mean": 0.9999788880348206, + "sampling/importance_sampling_ratio/min": 0.4627742886543274, + "sampling/sampling_logp_difference/max": 0.8917694330215454, + "sampling/sampling_logp_difference/mean": 0.01329927183687687, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1299.0, + "completions/max_terminated_length": 1299.0, + "completions/mean_length": 990.95, + "completions/mean_terminated_length": 990.95, + "completions/min_length": 752.8, + "completions/min_terminated_length": 752.8, + "entropy": 0.26616363525390624, + "epoch": 1.4923619271445359, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5357159376144409, + "learning_rate": 3.6176399321541065e-07, + "loss": 0.0009, + "num_tokens": 169984952.0, + "reward": 0.7421875357627868, + "reward_std": 0.11765087842941284, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7421875357627868, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28093287646770476, + "sampling/importance_sampling_ratio/max": 1.8472656726837158, + "sampling/importance_sampling_ratio/mean": 1.000068485736847, + "sampling/importance_sampling_ratio/min": 0.4106867015361786, + "sampling/sampling_logp_difference/max": 0.9185904502868653, + "sampling/sampling_logp_difference/mean": 0.013041174784302712, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1387.0, + "completions/max_terminated_length": 1387.0, + "completions/mean_length": 987.8, + "completions/mean_terminated_length": 987.8, + "completions/min_length": 690.6, + "completions/min_terminated_length": 690.6, + "entropy": 0.2711699903011322, + "epoch": 1.4982373678025853, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6310663223266602, + "learning_rate": 3.6115822631451414e-07, + "loss": -0.0026, + "num_tokens": 170648824.0, + "reward": 0.6416666805744171, + "reward_std": 0.1222050666809082, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6416666805744171, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2899262696504593, + "sampling/importance_sampling_ratio/max": 1.9429375410079956, + "sampling/importance_sampling_ratio/mean": 1.000006639957428, + "sampling/importance_sampling_ratio/min": 0.33936918079853057, + "sampling/sampling_logp_difference/max": 1.1747238516807557, + "sampling/sampling_logp_difference/mean": 0.013466325402259827, + "step": 1275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1381.8, + "completions/max_terminated_length": 1381.8, + "completions/mean_length": 959.015625, + "completions/mean_terminated_length": 959.015625, + "completions/min_length": 684.0, + "completions/min_terminated_length": 684.0, + "entropy": 0.26730274558067324, + "epoch": 1.5041128084606346, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.43127650022506714, + "learning_rate": 3.6055245941361763e-07, + "loss": 0.0017, + "num_tokens": 171280445.0, + "reward": 0.7687500059604645, + "reward_std": 0.11303048729896545, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7687500059604645, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2574013233184814, + "sampling/importance_sampling_ratio/max": 1.9692850351333617, + "sampling/importance_sampling_ratio/mean": 0.9999760866165162, + "sampling/importance_sampling_ratio/min": 0.3867207020521164, + "sampling/sampling_logp_difference/max": 1.179562497138977, + "sampling/sampling_logp_difference/mean": 0.013569790497422218, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1353.8, + "completions/max_terminated_length": 1353.8, + "completions/mean_length": 977.803125, + "completions/mean_terminated_length": 977.803125, + "completions/min_length": 769.2, + "completions/min_terminated_length": 769.2, + "entropy": 0.2703259289264679, + "epoch": 1.5099882491186838, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5339919924736023, + "learning_rate": 3.5994669251272107e-07, + "loss": -0.0022, + "num_tokens": 171940318.0, + "reward": 0.8276041984558106, + "reward_std": 0.10052161514759064, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8276041984558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24535318613052368, + "sampling/importance_sampling_ratio/max": 1.8749458074569703, + "sampling/importance_sampling_ratio/mean": 0.9999295115470886, + "sampling/importance_sampling_ratio/min": 0.41402388215065, + "sampling/sampling_logp_difference/max": 0.8982853889465332, + "sampling/sampling_logp_difference/mean": 0.0132589066401124, + "step": 1285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1344.6, + "completions/max_terminated_length": 1344.6, + "completions/mean_length": 993.3, + "completions/mean_terminated_length": 993.3, + "completions/min_length": 683.2, + "completions/min_terminated_length": 683.2, + "entropy": 0.2562386393547058, + "epoch": 1.5158636897767332, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6523430943489075, + "learning_rate": 3.5934092561182456e-07, + "loss": 0.0026, + "num_tokens": 172547102.0, + "reward": 0.7180729448795319, + "reward_std": 0.11666595637798309, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7180729448795319, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30029604732990267, + "sampling/importance_sampling_ratio/max": 1.8882590532302856, + "sampling/importance_sampling_ratio/mean": 1.0000897526741028, + "sampling/importance_sampling_ratio/min": 0.380884712934494, + "sampling/sampling_logp_difference/max": 0.9894267439842224, + "sampling/sampling_logp_difference/mean": 0.012895303219556809, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1410.6, + "completions/max_terminated_length": 1410.6, + "completions/mean_length": 1026.284375, + "completions/mean_terminated_length": 1026.284375, + "completions/min_length": 671.2, + "completions/min_terminated_length": 671.2, + "entropy": 0.2772479742765427, + "epoch": 1.5217391304347827, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8672804832458496, + "learning_rate": 3.5873515871092805e-07, + "loss": 0.0024, + "num_tokens": 173226025.0, + "reward": 0.8010937690734863, + "reward_std": 0.14239197373390197, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8010937690734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2529195070266724, + "sampling/importance_sampling_ratio/max": 1.9743723630905152, + "sampling/importance_sampling_ratio/mean": 1.0001447439193725, + "sampling/importance_sampling_ratio/min": 0.3725964456796646, + "sampling/sampling_logp_difference/max": 1.0943672776222229, + "sampling/sampling_logp_difference/mean": 0.013937021978199482, + "step": 1295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1239.2, + "completions/max_terminated_length": 1239.2, + "completions/mean_length": 953.140625, + "completions/mean_terminated_length": 953.140625, + "completions/min_length": 679.6, + "completions/min_terminated_length": 679.6, + "entropy": 0.25364493727684023, + "epoch": 1.527614571092832, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.3933258652687073, + "learning_rate": 3.581293918100315e-07, + "loss": 0.0064, + "num_tokens": 173848950.0, + "reward": 0.7255208492279053, + "reward_std": 0.10795222967863083, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7255208492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31787977516651156, + "sampling/importance_sampling_ratio/max": 1.8450412988662719, + "sampling/importance_sampling_ratio/mean": 0.9999418258666992, + "sampling/importance_sampling_ratio/min": 0.29344726353883743, + "sampling/sampling_logp_difference/max": 1.453646445274353, + "sampling/sampling_logp_difference/mean": 0.01269476506859064, + "step": 1300 + }, + { + "epoch": 1.527614571092832, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1382.28, + "eval_completions/max_terminated_length": 1382.28, + "eval_completions/mean_length": 965.419375, + "eval_completions/mean_terminated_length": 965.419375, + "eval_completions/min_length": 709.76, + "eval_completions/min_terminated_length": 709.76, + "eval_entropy": 0.26879385590553284, + "eval_frac_reward_zero_std": 0.49, + "eval_loss": 0.004258011933416128, + "eval_num_tokens": 173848950.0, + "eval_reward": 0.7047604262828827, + "eval_reward_std": 0.10310830242931843, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7047604262828827, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3197030121088028, + "eval_runtime": 367.9737, + "eval_samples_per_second": 0.272, + "eval_sampling/importance_sampling_ratio/max": 1.9342151737213136, + "eval_sampling/importance_sampling_ratio/mean": 1.0000292944908142, + "eval_sampling/importance_sampling_ratio/min": 0.37101240634918214, + "eval_sampling/sampling_logp_difference/max": 1.128817195892334, + "eval_sampling/sampling_logp_difference/mean": 0.013480741195380688, + "eval_steps_per_second": 0.005, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1578.0, + "completions/max_terminated_length": 1578.0, + "completions/mean_length": 1029.65625, + "completions/mean_terminated_length": 1029.65625, + "completions/min_length": 694.6, + "completions/min_terminated_length": 694.6, + "entropy": 0.2900865375995636, + "epoch": 1.5334900117508812, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8898422718048096, + "learning_rate": 3.5752362490913497e-07, + "loss": 0.0011, + "num_tokens": 174539704.0, + "reward": 0.6721354484558105, + "reward_std": 0.12114289328455925, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6721354484558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30387861728668214, + "sampling/importance_sampling_ratio/max": 1.9696703195571899, + "sampling/importance_sampling_ratio/mean": 1.0000692009925842, + "sampling/importance_sampling_ratio/min": 0.42663750648498533, + "sampling/sampling_logp_difference/max": 0.9719892978668213, + "sampling/sampling_logp_difference/mean": 0.0140638317912817, + "step": 1305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1501.8, + "completions/max_terminated_length": 1501.8, + "completions/mean_length": 1011.9, + "completions/mean_terminated_length": 1011.9, + "completions/min_length": 737.4, + "completions/min_terminated_length": 737.4, + "entropy": 0.27017735242843627, + "epoch": 1.5393654524089306, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8569518327713013, + "learning_rate": 3.569178580082384e-07, + "loss": -0.0016, + "num_tokens": 175190648.0, + "reward": 0.7643229365348816, + "reward_std": 0.11197378635406494, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7643229365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24610742479562758, + "sampling/importance_sampling_ratio/max": 1.9556582927703858, + "sampling/importance_sampling_ratio/mean": 0.9999646782875061, + "sampling/importance_sampling_ratio/min": 0.31404358744621275, + "sampling/sampling_logp_difference/max": 1.3031816244125367, + "sampling/sampling_logp_difference/mean": 0.013381559960544109, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1600.8, + "completions/max_terminated_length": 1600.8, + "completions/mean_length": 1084.4625, + "completions/mean_terminated_length": 1084.4625, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "entropy": 0.27942303419113157, + "epoch": 1.54524089306698, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.8738449215888977, + "learning_rate": 3.563120911073419e-07, + "loss": -0.0044, + "num_tokens": 175877324.0, + "reward": 0.6853646039962769, + "reward_std": 0.15504504293203353, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6853646039962769, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33614162504673006, + "sampling/importance_sampling_ratio/max": 1.9305254220962524, + "sampling/importance_sampling_ratio/mean": 0.9999444603919982, + "sampling/importance_sampling_ratio/min": 0.3318605124950409, + "sampling/sampling_logp_difference/max": 1.1842295169830321, + "sampling/sampling_logp_difference/mean": 0.01380113661289215, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.0, + "completions/max_terminated_length": 1365.0, + "completions/mean_length": 962.596875, + "completions/mean_terminated_length": 962.596875, + "completions/min_length": 651.4, + "completions/min_terminated_length": 651.4, + "entropy": 0.2730784237384796, + "epoch": 1.5511163337250293, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.9576377868652344, + "learning_rate": 3.557063242064454e-07, + "loss": -0.0066, + "num_tokens": 176521867.0, + "reward": 0.7044270992279053, + "reward_std": 0.1317270040512085, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7044270992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32034913301467893, + "sampling/importance_sampling_ratio/max": 1.9374944686889648, + "sampling/importance_sampling_ratio/mean": 0.9999537825584411, + "sampling/importance_sampling_ratio/min": 0.37277138531208037, + "sampling/sampling_logp_difference/max": 1.428537940979004, + "sampling/sampling_logp_difference/mean": 0.013699793815612793, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1501.0, + "completions/max_terminated_length": 1434.8, + "completions/mean_length": 935.81875, + "completions/mean_terminated_length": 931.1066040039062, + "completions/min_length": 664.6, + "completions/min_terminated_length": 664.6, + "entropy": 0.25443568229675295, + "epoch": 1.5569917743830788, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6914281845092773, + "learning_rate": 3.551005573055488e-07, + "loss": -0.0028, + "num_tokens": 177138973.0, + "reward": 0.7851562619209289, + "reward_std": 0.10055364742875099, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7851562619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2937077939510345, + "sampling/importance_sampling_ratio/max": 1.9170387506484985, + "sampling/importance_sampling_ratio/mean": 1.0001505136489868, + "sampling/importance_sampling_ratio/min": 0.3506089061498642, + "sampling/sampling_logp_difference/max": 1.0952062368392945, + "sampling/sampling_logp_difference/mean": 0.01276035774499178, + "step": 1325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1556.4, + "completions/max_terminated_length": 1556.4, + "completions/mean_length": 1025.075, + "completions/mean_terminated_length": 1025.075, + "completions/min_length": 752.4, + "completions/min_terminated_length": 752.4, + "entropy": 0.2668500870466232, + "epoch": 1.5628672150411282, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.806428849697113, + "learning_rate": 3.544947904046523e-07, + "loss": -0.0, + "num_tokens": 177795797.0, + "reward": 0.7680729389190674, + "reward_std": 0.08650054633617402, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7680729389190674, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27004733085632326, + "sampling/importance_sampling_ratio/max": 1.954677987098694, + "sampling/importance_sampling_ratio/mean": 1.0000656127929688, + "sampling/importance_sampling_ratio/min": 0.35064939856529237, + "sampling/sampling_logp_difference/max": 1.1046170234680175, + "sampling/sampling_logp_difference/mean": 0.013290046527981759, + "step": 1330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1229.8, + "completions/max_terminated_length": 1229.8, + "completions/mean_length": 947.878125, + "completions/mean_terminated_length": 947.878125, + "completions/min_length": 675.6, + "completions/min_terminated_length": 675.6, + "entropy": 0.27370848655700686, + "epoch": 1.5687426556991775, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.763029932975769, + "learning_rate": 3.538890235037557e-07, + "loss": -0.0025, + "num_tokens": 178420750.0, + "reward": 0.7265625238418579, + "reward_std": 0.14514898210763932, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7265625238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.289459753036499, + "sampling/importance_sampling_ratio/max": 1.9520225524902344, + "sampling/importance_sampling_ratio/mean": 1.0000372052192688, + "sampling/importance_sampling_ratio/min": 0.31350120902112816, + "sampling/sampling_logp_difference/max": 6.1525186419487, + "sampling/sampling_logp_difference/mean": 0.013847914896905423, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1468.6, + "completions/max_terminated_length": 1468.6, + "completions/mean_length": 1013.05625, + "completions/mean_terminated_length": 1013.05625, + "completions/min_length": 697.4, + "completions/min_terminated_length": 697.4, + "entropy": 0.2661381125450134, + "epoch": 1.5746180963572267, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.8375189900398254, + "learning_rate": 3.532832566028592e-07, + "loss": 0.0031, + "num_tokens": 179052032.0, + "reward": 0.8404687762260437, + "reward_std": 0.09039057418704033, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8404687762260437, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2524672240018845, + "sampling/importance_sampling_ratio/max": 1.9079517602920533, + "sampling/importance_sampling_ratio/mean": 0.999993360042572, + "sampling/importance_sampling_ratio/min": 0.3867530390620232, + "sampling/sampling_logp_difference/max": 1.1689835071563721, + "sampling/sampling_logp_difference/mean": 0.013322325237095357, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1527.4, + "completions/max_terminated_length": 1341.0, + "completions/mean_length": 1034.946875, + "completions/mean_terminated_length": 1030.3750732421875, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "entropy": 0.261656728386879, + "epoch": 1.5804935370152762, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13206477463245392, + "learning_rate": 3.526774897019627e-07, + "loss": -0.0105, + "num_tokens": 179727691.0, + "reward": 0.7455729246139526, + "reward_std": 0.10084965825080872, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7455729246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23413763344287872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999989628791809, + "sampling/importance_sampling_ratio/min": 0.38248581886291505, + "sampling/sampling_logp_difference/max": 1.209545373916626, + "sampling/sampling_logp_difference/mean": 0.013246373273432255, + "step": 1345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1276.8, + "completions/max_terminated_length": 1276.8, + "completions/mean_length": 938.6, + "completions/mean_terminated_length": 938.6, + "completions/min_length": 676.2, + "completions/min_terminated_length": 676.2, + "entropy": 0.2708220988512039, + "epoch": 1.5863689776733256, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.8071889877319336, + "learning_rate": 3.520717228010661e-07, + "loss": 0.0029, + "num_tokens": 180349259.0, + "reward": 0.7190104246139526, + "reward_std": 0.09788908958435058, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7190104246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30650861859321593, + "sampling/importance_sampling_ratio/max": 1.958139681816101, + "sampling/importance_sampling_ratio/mean": 1.0000072836875915, + "sampling/importance_sampling_ratio/min": 0.4504124343395233, + "sampling/sampling_logp_difference/max": 0.8907589435577392, + "sampling/sampling_logp_difference/mean": 0.013682788796722889, + "step": 1350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1418.0, + "completions/max_terminated_length": 1418.0, + "completions/mean_length": 950.903125, + "completions/mean_terminated_length": 950.903125, + "completions/min_length": 681.2, + "completions/min_terminated_length": 681.2, + "entropy": 0.2506708770990372, + "epoch": 1.5922444183313749, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.7698414921760559, + "learning_rate": 3.514659559001696e-07, + "loss": 0.0022, + "num_tokens": 180941676.0, + "reward": 0.8494791746139526, + "reward_std": 0.08073322921991348, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8494791746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22139372527599335, + "sampling/importance_sampling_ratio/max": 1.8772455215454102, + "sampling/importance_sampling_ratio/mean": 1.0000471472740173, + "sampling/importance_sampling_ratio/min": 0.28111872524023057, + "sampling/sampling_logp_difference/max": 1.3952457189559937, + "sampling/sampling_logp_difference/mean": 0.012653507106006146, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1521.0, + "completions/max_terminated_length": 1521.0, + "completions/mean_length": 1009.0875, + "completions/mean_terminated_length": 1009.0875, + "completions/min_length": 764.6, + "completions/min_terminated_length": 764.6, + "entropy": 0.258016636967659, + "epoch": 1.598119858989424, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8082835674285889, + "learning_rate": 3.5086018899927304e-07, + "loss": 0.0038, + "num_tokens": 181581192.0, + "reward": 0.6520833492279052, + "reward_std": 0.10948452875018119, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6520833492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36065220832824707, + "sampling/importance_sampling_ratio/max": 1.9668825149536133, + "sampling/importance_sampling_ratio/mean": 0.999875009059906, + "sampling/importance_sampling_ratio/min": 0.26854347884655, + "sampling/sampling_logp_difference/max": 1.6615525960922242, + "sampling/sampling_logp_difference/mean": 0.013133746571838856, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1402.0, + "completions/max_terminated_length": 1402.0, + "completions/mean_length": 964.73125, + "completions/mean_terminated_length": 964.73125, + "completions/min_length": 638.8, + "completions/min_terminated_length": 638.8, + "entropy": 0.27279475927352903, + "epoch": 1.6039952996474736, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.37393927574157715, + "learning_rate": 3.5025442209837653e-07, + "loss": -0.0099, + "num_tokens": 182214178.0, + "reward": 0.6559896111488343, + "reward_std": 0.1184864416718483, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6559896111488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34022014737129214, + "sampling/importance_sampling_ratio/max": 1.981991744041443, + "sampling/importance_sampling_ratio/mean": 1.0000012278556825, + "sampling/importance_sampling_ratio/min": 0.39412103295326234, + "sampling/sampling_logp_difference/max": 0.9828832149505615, + "sampling/sampling_logp_difference/mean": 0.013542711734771729, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1341.8, + "completions/max_terminated_length": 1341.8, + "completions/mean_length": 955.240625, + "completions/mean_terminated_length": 955.240625, + "completions/min_length": 721.2, + "completions/min_terminated_length": 721.2, + "entropy": 0.26667892932891846, + "epoch": 1.609870740305523, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4775325059890747, + "learning_rate": 3.4964865519748e-07, + "loss": 0.0031, + "num_tokens": 182863567.0, + "reward": 0.6739583492279053, + "reward_std": 0.10213126838207245, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6739583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.4062064468860626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000226378440857, + "sampling/importance_sampling_ratio/min": 0.44252710342407225, + "sampling/sampling_logp_difference/max": 1.2520951271057128, + "sampling/sampling_logp_difference/mean": 0.013411963172256947, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1326.6, + "completions/max_terminated_length": 1326.6, + "completions/mean_length": 983.440625, + "completions/mean_terminated_length": 983.440625, + "completions/min_length": 761.8, + "completions/min_terminated_length": 761.8, + "entropy": 0.2573852360248566, + "epoch": 1.6157461809635723, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.726811945438385, + "learning_rate": 3.4904288829658345e-07, + "loss": -0.0038, + "num_tokens": 183484732.0, + "reward": 0.7614062547683715, + "reward_std": 0.12323106527328491, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7614062547683715, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28980146944522855, + "sampling/importance_sampling_ratio/max": 1.8720725297927856, + "sampling/importance_sampling_ratio/mean": 1.0000526189804078, + "sampling/importance_sampling_ratio/min": 0.4041356325149536, + "sampling/sampling_logp_difference/max": 1.0532057285308838, + "sampling/sampling_logp_difference/mean": 0.012885104678571225, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1204.4, + "completions/max_terminated_length": 1204.4, + "completions/mean_length": 883.18125, + "completions/mean_terminated_length": 883.18125, + "completions/min_length": 621.2, + "completions/min_terminated_length": 621.2, + "entropy": 0.26330329179763795, + "epoch": 1.6216216216216215, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.5486319065093994, + "learning_rate": 3.4843712139568694e-07, + "loss": -0.001, + "num_tokens": 184108406.0, + "reward": 0.6694270968437195, + "reward_std": 0.14080710634589194, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6694271087646484, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3262661784887314, + "sampling/importance_sampling_ratio/max": 1.9142040491104126, + "sampling/importance_sampling_ratio/mean": 0.9999730348587036, + "sampling/importance_sampling_ratio/min": 0.2835851192474365, + "sampling/sampling_logp_difference/max": 1.327359104156494, + "sampling/sampling_logp_difference/mean": 0.013574123941361904, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1382.6, + "completions/max_terminated_length": 1382.6, + "completions/mean_length": 973.8375, + "completions/mean_terminated_length": 973.8375, + "completions/min_length": 697.4, + "completions/min_terminated_length": 697.4, + "entropy": 0.25520346462726595, + "epoch": 1.627497062279671, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6697659492492676, + "learning_rate": 3.478313544947904e-07, + "loss": 0.0007, + "num_tokens": 184745202.0, + "reward": 0.8057291984558106, + "reward_std": 0.10847726836800575, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8057291984558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2511609926819801, + "sampling/importance_sampling_ratio/max": 1.9274680614471436, + "sampling/importance_sampling_ratio/mean": 0.9999136447906494, + "sampling/importance_sampling_ratio/min": 0.3132738881278783, + "sampling/sampling_logp_difference/max": 2.042408013343811, + "sampling/sampling_logp_difference/mean": 0.013148021697998048, + "step": 1385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.2, + "completions/max_terminated_length": 1343.2, + "completions/mean_length": 983.940625, + "completions/mean_terminated_length": 983.940625, + "completions/min_length": 770.4, + "completions/min_terminated_length": 770.4, + "entropy": 0.24816880524158477, + "epoch": 1.6333725029377204, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.8195874094963074, + "learning_rate": 3.4722558759389387e-07, + "loss": -0.0063, + "num_tokens": 185400591.0, + "reward": 0.7890625119209289, + "reward_std": 0.05731135383248329, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7890625119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.274530765414238, + "sampling/importance_sampling_ratio/max": 1.9616381645202636, + "sampling/importance_sampling_ratio/mean": 0.9999887347221375, + "sampling/importance_sampling_ratio/min": 0.32951097935438156, + "sampling/sampling_logp_difference/max": 1.2189922094345094, + "sampling/sampling_logp_difference/mean": 0.012603016383945943, + "step": 1390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1579.6, + "completions/max_terminated_length": 1579.6, + "completions/mean_length": 1055.6875, + "completions/mean_terminated_length": 1055.6875, + "completions/min_length": 822.2, + "completions/min_terminated_length": 822.2, + "entropy": 0.280494225025177, + "epoch": 1.6392479435957696, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.8461142778396606, + "learning_rate": 3.4661982069299736e-07, + "loss": 0.0024, + "num_tokens": 186054043.0, + "reward": 0.6911458611488343, + "reward_std": 0.13243722468614577, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6911458611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30035166144371034, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000493407249451, + "sampling/importance_sampling_ratio/min": 0.3128639668226242, + "sampling/sampling_logp_difference/max": 1.3867409229278564, + "sampling/sampling_logp_difference/mean": 0.013728627003729343, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1480.4, + "completions/max_terminated_length": 1480.4, + "completions/mean_length": 1067.021875, + "completions/mean_terminated_length": 1067.021875, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.26939132809638977, + "epoch": 1.6451233842538189, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.6929203271865845, + "learning_rate": 3.460140537921008e-07, + "loss": 0.0062, + "num_tokens": 186696866.0, + "reward": 0.7593750119209289, + "reward_std": 0.14759771823883056, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7593750119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3351838618516922, + "sampling/importance_sampling_ratio/max": 1.9693204164505005, + "sampling/importance_sampling_ratio/mean": 0.9999853372573853, + "sampling/importance_sampling_ratio/min": 0.2552876703441143, + "sampling/sampling_logp_difference/max": 1.6299549341201782, + "sampling/sampling_logp_difference/mean": 0.013131172768771649, + "step": 1400 + }, + { + "epoch": 1.6451233842538189, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1447.16, + "eval_completions/max_terminated_length": 1447.16, + "eval_completions/mean_length": 1013.66375, + "eval_completions/mean_terminated_length": 1013.66375, + "eval_completions/min_length": 747.24, + "eval_completions/min_terminated_length": 747.24, + "eval_entropy": 0.27061537742614744, + "eval_frac_reward_zero_std": 0.47, + "eval_loss": 0.0014171568909659982, + "eval_num_tokens": 186696866.0, + "eval_reward": 0.7115208458900452, + "eval_reward_std": 0.1059424777328968, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7115208458900452, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3199702352285385, + "eval_runtime": 391.0468, + "eval_samples_per_second": 0.256, + "eval_sampling/importance_sampling_ratio/max": 1.9688294887542725, + "eval_sampling/importance_sampling_ratio/mean": 0.9999658560752869, + "eval_sampling/importance_sampling_ratio/min": 0.3690611620247364, + "eval_sampling/sampling_logp_difference/max": 1.168926215171814, + "eval_sampling/sampling_logp_difference/mean": 0.013416541777551173, + "eval_steps_per_second": 0.005, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1724.6, + "completions/max_terminated_length": 1675.6, + "completions/mean_length": 1076.90625, + "completions/mean_terminated_length": 1072.59599609375, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 0.2762373864650726, + "epoch": 1.6509988249118686, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.559288501739502, + "learning_rate": 3.454082868912043e-07, + "loss": -0.0008, + "num_tokens": 187377088.0, + "reward": 0.6973958492279053, + "reward_std": 0.11119076311588287, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6973958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3615538030862808, + "sampling/importance_sampling_ratio/max": 1.9869262456893921, + "sampling/importance_sampling_ratio/mean": 1.0000603914260864, + "sampling/importance_sampling_ratio/min": 0.1473228994058445, + "sampling/sampling_logp_difference/max": 2.740011477470398, + "sampling/sampling_logp_difference/mean": 0.013979729451239108, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1452.4, + "completions/max_terminated_length": 1452.4, + "completions/mean_length": 1081.03125, + "completions/mean_terminated_length": 1081.03125, + "completions/min_length": 809.8, + "completions/min_terminated_length": 809.8, + "entropy": 0.263926637172699, + "epoch": 1.6568742655699178, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8976117372512817, + "learning_rate": 3.448025199903077e-07, + "loss": 0.0007, + "num_tokens": 188023354.0, + "reward": 0.8164062619209289, + "reward_std": 0.12308143377304077, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8164062619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23725470006465912, + "sampling/importance_sampling_ratio/max": 1.9938726425170898, + "sampling/importance_sampling_ratio/mean": 1.0001226782798767, + "sampling/importance_sampling_ratio/min": 0.27841649786059863, + "sampling/sampling_logp_difference/max": 2.92758526802063, + "sampling/sampling_logp_difference/mean": 0.013116902112960816, + "step": 1410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.6, + "completions/max_terminated_length": 1446.6, + "completions/mean_length": 1022.946875, + "completions/mean_terminated_length": 1022.946875, + "completions/min_length": 763.4, + "completions/min_terminated_length": 763.4, + "entropy": 0.27659066915512087, + "epoch": 1.662749706227967, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.9148181080818176, + "learning_rate": 3.4419675308941116e-07, + "loss": 0.0005, + "num_tokens": 188676937.0, + "reward": 0.8335937619209289, + "reward_std": 0.1323336124420166, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8335937619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22035141587257384, + "sampling/importance_sampling_ratio/max": 1.8846810340881348, + "sampling/importance_sampling_ratio/mean": 0.9998690485954285, + "sampling/importance_sampling_ratio/min": 0.3646425485610962, + "sampling/sampling_logp_difference/max": 1.169146227836609, + "sampling/sampling_logp_difference/mean": 0.013671478442847728, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1598.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 1139.496875, + "completions/mean_terminated_length": 1139.496875, + "completions/min_length": 878.8, + "completions/min_terminated_length": 878.8, + "entropy": 0.2779924929141998, + "epoch": 1.6686251468860165, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.4461043179035187, + "learning_rate": 3.4359098618851465e-07, + "loss": 0.0037, + "num_tokens": 189358488.0, + "reward": 0.6711979269981384, + "reward_std": 0.13433899730443954, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6711979269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3418775200843811, + "sampling/importance_sampling_ratio/max": 1.8875983238220215, + "sampling/importance_sampling_ratio/mean": 1.000086212158203, + "sampling/importance_sampling_ratio/min": 0.3236319288611412, + "sampling/sampling_logp_difference/max": 1.3859389901161194, + "sampling/sampling_logp_difference/mean": 0.013647865317761899, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1635.2, + "completions/max_terminated_length": 1633.4, + "completions/mean_length": 1141.5625, + "completions/mean_terminated_length": 1133.90361328125, + "completions/min_length": 852.8, + "completions/min_terminated_length": 852.8, + "entropy": 0.25371613204479215, + "epoch": 1.674500587544066, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.48061123490333557, + "learning_rate": 3.429852192876181e-07, + "loss": -0.0139, + "num_tokens": 190017716.0, + "reward": 0.85546875, + "reward_std": 0.12050826102495193, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.85546875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23054370284080505, + "sampling/importance_sampling_ratio/max": 1.9753512620925904, + "sampling/importance_sampling_ratio/mean": 1.0000061392784119, + "sampling/importance_sampling_ratio/min": 0.33579882979393005, + "sampling/sampling_logp_difference/max": 1.2614384889602661, + "sampling/sampling_logp_difference/mean": 0.012662022560834884, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2010.0, + "completions/max_terminated_length": 1909.8, + "completions/mean_length": 1150.165625, + "completions/mean_terminated_length": 1141.9548828125, + "completions/min_length": 837.6, + "completions/min_terminated_length": 837.6, + "entropy": 0.2775812327861786, + "epoch": 1.6803760282021152, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7772521376609802, + "learning_rate": 3.4237945238672157e-07, + "loss": -0.0203, + "num_tokens": 190713313.0, + "reward": 0.7494791746139526, + "reward_std": 0.17326337993144988, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7494791865348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3318782389163971, + "sampling/importance_sampling_ratio/max": 1.9999818563461305, + "sampling/importance_sampling_ratio/mean": 1.0000026941299438, + "sampling/importance_sampling_ratio/min": 0.38341291844844816, + "sampling/sampling_logp_difference/max": 1.1000696659088134, + "sampling/sampling_logp_difference/mean": 0.013829389959573746, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.8, + "completions/max_terminated_length": 1708.8, + "completions/mean_length": 1123.74375, + "completions/mean_terminated_length": 1123.74375, + "completions/min_length": 738.0, + "completions/min_terminated_length": 738.0, + "entropy": 0.2673244297504425, + "epoch": 1.6862514688601644, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6763787865638733, + "learning_rate": 3.41773685485825e-07, + "loss": -0.0027, + "num_tokens": 191396479.0, + "reward": 0.6844791769981384, + "reward_std": 0.14983025342226028, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6844791769981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3457943320274353, + "sampling/importance_sampling_ratio/max": 1.9644380331039428, + "sampling/importance_sampling_ratio/mean": 1.0000961780548097, + "sampling/importance_sampling_ratio/min": 0.3317388445138931, + "sampling/sampling_logp_difference/max": 1.1634628534317017, + "sampling/sampling_logp_difference/mean": 0.01333068311214447, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1594.6, + "completions/max_terminated_length": 1594.6, + "completions/mean_length": 1104.1125, + "completions/mean_terminated_length": 1104.1125, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "entropy": 0.2846644163131714, + "epoch": 1.6921269095182139, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.842131495475769, + "learning_rate": 3.411679185849285e-07, + "loss": 0.0031, + "num_tokens": 192105251.0, + "reward": 0.79411461353302, + "reward_std": 0.18594035059213637, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.79411461353302, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29445070028305054, + "sampling/importance_sampling_ratio/max": 1.9737318515777589, + "sampling/importance_sampling_ratio/mean": 1.000055193901062, + "sampling/importance_sampling_ratio/min": 0.3524580836296082, + "sampling/sampling_logp_difference/max": 1.214442205429077, + "sampling/sampling_logp_difference/mean": 0.01409766599535942, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.6, + "completions/max_terminated_length": 1492.6, + "completions/mean_length": 1088.29375, + "completions/mean_terminated_length": 1088.29375, + "completions/min_length": 832.8, + "completions/min_terminated_length": 832.8, + "entropy": 0.24627983570098877, + "epoch": 1.6980023501762633, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5946608781814575, + "learning_rate": 3.40562151684032e-07, + "loss": 0.0004, + "num_tokens": 192769057.0, + "reward": 0.8041666984558106, + "reward_std": 0.13151083439588546, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8041666984558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26574690639972687, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000511169433595, + "sampling/importance_sampling_ratio/min": 0.3163315311074257, + "sampling/sampling_logp_difference/max": 1.519883394241333, + "sampling/sampling_logp_difference/mean": 0.012410031445324422, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1407.6, + "completions/max_terminated_length": 1407.6, + "completions/mean_length": 1042.9375, + "completions/mean_terminated_length": 1042.9375, + "completions/min_length": 769.2, + "completions/min_terminated_length": 769.2, + "entropy": 0.2711980938911438, + "epoch": 1.7038777908343126, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.9581571817398071, + "learning_rate": 3.399563847831354e-07, + "loss": 0.0065, + "num_tokens": 193391677.0, + "reward": 0.8053646087646484, + "reward_std": 0.10521206557750702, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8053646087646484, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24722022712230682, + "sampling/importance_sampling_ratio/max": 1.9165439844131469, + "sampling/importance_sampling_ratio/mean": 0.9998835325241089, + "sampling/importance_sampling_ratio/min": 0.35185267627239225, + "sampling/sampling_logp_difference/max": 1.135908579826355, + "sampling/sampling_logp_difference/mean": 0.013463702611625194, + "step": 1450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1591.8, + "completions/max_terminated_length": 1591.8, + "completions/mean_length": 1123.79375, + "completions/mean_terminated_length": 1123.79375, + "completions/min_length": 847.2, + "completions/min_terminated_length": 847.2, + "entropy": 0.27680361866950987, + "epoch": 1.7097532314923618, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.9067860841751099, + "learning_rate": 3.393506178822389e-07, + "loss": -0.0007, + "num_tokens": 194075291.0, + "reward": 0.7929687738418579, + "reward_std": 0.1173098023980856, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7929687738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27822592854499817, + "sampling/importance_sampling_ratio/max": 1.997536540031433, + "sampling/importance_sampling_ratio/mean": 0.9999045729637146, + "sampling/importance_sampling_ratio/min": 0.24495663307607174, + "sampling/sampling_logp_difference/max": 1.8837570667266845, + "sampling/sampling_logp_difference/mean": 0.013546660356223584, + "step": 1455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1802.6, + "completions/max_terminated_length": 1802.6, + "completions/mean_length": 1195.10625, + "completions/mean_terminated_length": 1195.10625, + "completions/min_length": 867.6, + "completions/min_terminated_length": 867.6, + "entropy": 0.2939670443534851, + "epoch": 1.7156286721504113, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8876379728317261, + "learning_rate": 3.3874485098134235e-07, + "loss": 0.0023, + "num_tokens": 194772253.0, + "reward": 0.8272916793823242, + "reward_std": 0.1560825377702713, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8272916913032532, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2652295768260956, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000036716461183, + "sampling/importance_sampling_ratio/min": 0.34000002443790434, + "sampling/sampling_logp_difference/max": 1.2622709274291992, + "sampling/sampling_logp_difference/mean": 0.014317047223448754, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1873.4, + "completions/max_terminated_length": 1588.2, + "completions/mean_length": 1152.790625, + "completions/mean_terminated_length": 1144.909619140625, + "completions/min_length": 763.8, + "completions/min_terminated_length": 763.8, + "entropy": 0.2873714804649353, + "epoch": 1.7215041128084607, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.39559435844421387, + "learning_rate": 3.3813908408044584e-07, + "loss": -0.0288, + "num_tokens": 195478178.0, + "reward": 0.8166666984558105, + "reward_std": 0.11755480468273163, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8166666984558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23863519877195358, + "sampling/importance_sampling_ratio/max": 1.9812901973724366, + "sampling/importance_sampling_ratio/mean": 0.9998369455337525, + "sampling/importance_sampling_ratio/min": 0.3511409223079681, + "sampling/sampling_logp_difference/max": 1.1258719205856322, + "sampling/sampling_logp_difference/mean": 0.014134268276393414, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1786.4, + "completions/max_terminated_length": 1709.2, + "completions/mean_length": 1069.403125, + "completions/mean_terminated_length": 1060.7073486328125, + "completions/min_length": 754.4, + "completions/min_terminated_length": 754.4, + "entropy": 0.26078082621097565, + "epoch": 1.72737955346651, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7395367622375488, + "learning_rate": 3.3753331717954933e-07, + "loss": -0.0151, + "num_tokens": 196111723.0, + "reward": 0.8067708373069763, + "reward_std": 0.09850462116301059, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8067708373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2902964472770691, + "sampling/importance_sampling_ratio/max": 1.9457304000854492, + "sampling/importance_sampling_ratio/mean": 0.9999278068542481, + "sampling/importance_sampling_ratio/min": 0.3522989869117737, + "sampling/sampling_logp_difference/max": 1.0490608930587768, + "sampling/sampling_logp_difference/mean": 0.013238179869949818, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1820.2, + "completions/max_terminated_length": 1820.2, + "completions/mean_length": 1133.53125, + "completions/mean_terminated_length": 1133.53125, + "completions/min_length": 744.6, + "completions/min_terminated_length": 744.6, + "entropy": 0.2913938283920288, + "epoch": 1.7332549941245592, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.725741446018219, + "learning_rate": 3.3692755027865276e-07, + "loss": -0.0034, + "num_tokens": 196785365.0, + "reward": 0.8822916865348815, + "reward_std": 0.13487583696842192, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8822916865348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22184424996376037, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001361846923829, + "sampling/importance_sampling_ratio/min": 0.23281437605627903, + "sampling/sampling_logp_difference/max": 6.875998139381409, + "sampling/sampling_logp_difference/mean": 0.014337152801454067, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1587.2, + "completions/max_terminated_length": 1587.2, + "completions/mean_length": 1084.86875, + "completions/mean_terminated_length": 1084.86875, + "completions/min_length": 740.6, + "completions/min_terminated_length": 740.6, + "entropy": 0.2729551553726196, + "epoch": 1.7391304347826086, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.8482347726821899, + "learning_rate": 3.3632178337775625e-07, + "loss": 0.0076, + "num_tokens": 197459147.0, + "reward": 0.7274479329586029, + "reward_std": 0.0780523905530572, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7274479389190673, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2432739406824112, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000293016433717, + "sampling/importance_sampling_ratio/min": 0.37133134007453916, + "sampling/sampling_logp_difference/max": 1.160807228088379, + "sampling/sampling_logp_difference/mean": 0.01364643257111311, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1518.4, + "completions/max_terminated_length": 1518.4, + "completions/mean_length": 1082.0125, + "completions/mean_terminated_length": 1082.0125, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "entropy": 0.2662626028060913, + "epoch": 1.745005875440658, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.763447642326355, + "learning_rate": 3.357160164768597e-07, + "loss": 0.0033, + "num_tokens": 198135823.0, + "reward": 0.8534896016120911, + "reward_std": 0.11715929210186005, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8534896016120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20295286029577256, + "sampling/importance_sampling_ratio/max": 1.937989926338196, + "sampling/importance_sampling_ratio/mean": 0.9998504519462585, + "sampling/importance_sampling_ratio/min": 0.3984066128730774, + "sampling/sampling_logp_difference/max": 1.212566328048706, + "sampling/sampling_logp_difference/mean": 0.01346975788474083, + "step": 1485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1595.2, + "completions/max_terminated_length": 1595.2, + "completions/mean_length": 1146.109375, + "completions/mean_terminated_length": 1146.109375, + "completions/min_length": 805.4, + "completions/min_terminated_length": 805.4, + "entropy": 0.2847128093242645, + "epoch": 1.7508813160987073, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5143160223960876, + "learning_rate": 3.351102495759632e-07, + "loss": 0.0047, + "num_tokens": 198828690.0, + "reward": 0.6278125166893005, + "reward_std": 0.0884520411491394, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6278125166893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3630589783191681, + "sampling/importance_sampling_ratio/max": 1.9360803604125976, + "sampling/importance_sampling_ratio/mean": 1.000067901611328, + "sampling/importance_sampling_ratio/min": 0.2725113719701767, + "sampling/sampling_logp_difference/max": 1.3496559381484985, + "sampling/sampling_logp_difference/mean": 0.014137699641287327, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1509.8, + "completions/max_terminated_length": 1509.8, + "completions/mean_length": 1099.55, + "completions/mean_terminated_length": 1099.55, + "completions/min_length": 817.4, + "completions/min_terminated_length": 817.4, + "entropy": 0.2684398263692856, + "epoch": 1.7567567567567568, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6384820938110352, + "learning_rate": 3.3450448267506667e-07, + "loss": -0.003, + "num_tokens": 199510882.0, + "reward": 0.8046875119209289, + "reward_std": 0.08579807132482528, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8046875119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23521889597177506, + "sampling/importance_sampling_ratio/max": 1.968173861503601, + "sampling/importance_sampling_ratio/mean": 0.9999576926231384, + "sampling/importance_sampling_ratio/min": 0.41731377840042116, + "sampling/sampling_logp_difference/max": 1.2025867462158204, + "sampling/sampling_logp_difference/mean": 0.013431616872549058, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1570.4, + "completions/max_terminated_length": 1570.4, + "completions/mean_length": 1124.840625, + "completions/mean_terminated_length": 1124.840625, + "completions/min_length": 745.4, + "completions/min_terminated_length": 745.4, + "entropy": 0.2712507307529449, + "epoch": 1.7626321974148063, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7118411064147949, + "learning_rate": 3.3389871577417005e-07, + "loss": 0.0033, + "num_tokens": 200198207.0, + "reward": 0.8050000071525574, + "reward_std": 0.12036772668361664, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8050000071525574, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27572363168001174, + "sampling/importance_sampling_ratio/max": 1.9563879728317262, + "sampling/importance_sampling_ratio/mean": 1.0000667214393615, + "sampling/importance_sampling_ratio/min": 0.44692354202270507, + "sampling/sampling_logp_difference/max": 0.8758127689361572, + "sampling/sampling_logp_difference/mean": 0.013625680841505528, + "step": 1500 + }, + { + "epoch": 1.7626321974148063, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1536.84, + "eval_completions/max_terminated_length": 1536.84, + "eval_completions/mean_length": 1071.7275, + "eval_completions/mean_terminated_length": 1071.7275, + "eval_completions/min_length": 773.0, + "eval_completions/min_terminated_length": 773.0, + "eval_entropy": 0.27730977356433867, + "eval_frac_reward_zero_std": 0.58, + "eval_loss": 0.0037606186233460903, + "eval_num_tokens": 200198207.0, + "eval_reward": 0.7363020944595337, + "eval_reward_std": 0.09190770551562309, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7363020944595337, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3109273624420166, + "eval_runtime": 415.8157, + "eval_samples_per_second": 0.24, + "eval_sampling/importance_sampling_ratio/max": 1.9509496116638183, + "eval_sampling/importance_sampling_ratio/mean": 0.9999890184402466, + "eval_sampling/importance_sampling_ratio/min": 0.34508894979953764, + "eval_sampling/sampling_logp_difference/max": 1.3034389925003051, + "eval_sampling/sampling_logp_difference/mean": 0.013880596235394478, + "eval_steps_per_second": 0.005, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1645.2, + "completions/max_terminated_length": 1645.2, + "completions/mean_length": 1086.928125, + "completions/mean_terminated_length": 1086.928125, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "entropy": 0.2728815793991089, + "epoch": 1.7685076380728555, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8437840938568115, + "learning_rate": 3.3329294887327354e-07, + "loss": 0.0099, + "num_tokens": 200872440.0, + "reward": 0.8072916805744171, + "reward_std": 0.12046882957220077, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8072916805744171, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2373180866241455, + "sampling/importance_sampling_ratio/max": 1.99754478931427, + "sampling/importance_sampling_ratio/mean": 1.0000806212425233, + "sampling/importance_sampling_ratio/min": 0.26217866539955137, + "sampling/sampling_logp_difference/max": 1.6371884107589723, + "sampling/sampling_logp_difference/mean": 0.013733114674687385, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1595.6, + "completions/max_terminated_length": 1595.6, + "completions/mean_length": 1065.984375, + "completions/mean_terminated_length": 1065.984375, + "completions/min_length": 792.4, + "completions/min_terminated_length": 792.4, + "entropy": 0.26055130958557127, + "epoch": 1.7743830787309047, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 3.32687181972377e-07, + "loss": 0.0001, + "num_tokens": 201514851.0, + "reward": 0.7830729246139526, + "reward_std": 0.0687948226928711, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7830729365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2838017612695694, + "sampling/importance_sampling_ratio/max": 1.9749235153198241, + "sampling/importance_sampling_ratio/mean": 1.0000560760498047, + "sampling/importance_sampling_ratio/min": 0.297700959444046, + "sampling/sampling_logp_difference/max": 1.359819483757019, + "sampling/sampling_logp_difference/mean": 0.01325578261166811, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 1057.603125, + "completions/mean_terminated_length": 1057.603125, + "completions/min_length": 773.6, + "completions/min_terminated_length": 773.6, + "entropy": 0.255610191822052, + "epoch": 1.7802585193889542, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.430332213640213, + "learning_rate": 3.3208141507148047e-07, + "loss": 0.0011, + "num_tokens": 202164196.0, + "reward": 0.7718750238418579, + "reward_std": 0.16177449077367784, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7718750238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29015558063983915, + "sampling/importance_sampling_ratio/max": 1.9674141883850098, + "sampling/importance_sampling_ratio/mean": 0.9999878883361817, + "sampling/importance_sampling_ratio/min": 0.3030325770378113, + "sampling/sampling_logp_difference/max": 1.3088083505630492, + "sampling/sampling_logp_difference/mean": 0.01293297652155161, + "step": 1515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.6, + "completions/max_terminated_length": 1647.6, + "completions/mean_length": 1092.4125, + "completions/mean_terminated_length": 1092.4125, + "completions/min_length": 760.8, + "completions/min_terminated_length": 760.8, + "entropy": 0.2662869393825531, + "epoch": 1.7861339600470036, + "frac_reward_zero_std": 0.45, + "grad_norm": 2.165395736694336, + "learning_rate": 3.3147564817058396e-07, + "loss": -0.0056, + "num_tokens": 202817144.0, + "reward": 0.6911458492279052, + "reward_std": 0.11217592209577561, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6911458492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3047403275966644, + "sampling/importance_sampling_ratio/max": 1.9844163179397583, + "sampling/importance_sampling_ratio/mean": 1.0000060439109801, + "sampling/importance_sampling_ratio/min": 0.40356804728507994, + "sampling/sampling_logp_difference/max": 0.9722619771957397, + "sampling/sampling_logp_difference/mean": 0.013397721946239472, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1852.6, + "completions/max_terminated_length": 1848.6, + "completions/mean_length": 1120.71875, + "completions/mean_terminated_length": 1104.1087646484375, + "completions/min_length": 716.8, + "completions/min_terminated_length": 716.8, + "entropy": 0.27589981257915497, + "epoch": 1.7920094007050529, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.6116582155227661, + "learning_rate": 3.308698812696874e-07, + "loss": -0.0213, + "num_tokens": 203493166.0, + "reward": 0.6966145992279053, + "reward_std": 0.13366687893867493, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6966145992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32275949120521547, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000032782554626, + "sampling/importance_sampling_ratio/min": 0.3299687564373016, + "sampling/sampling_logp_difference/max": 1.2879135131835937, + "sampling/sampling_logp_difference/mean": 0.013806664571166038, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1564.8, + "completions/max_terminated_length": 1564.8, + "completions/mean_length": 1040.4125, + "completions/mean_terminated_length": 1040.4125, + "completions/min_length": 718.4, + "completions/min_terminated_length": 718.4, + "entropy": 0.25432821810245515, + "epoch": 1.7978848413631021, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.621759831905365, + "learning_rate": 3.302641143687909e-07, + "loss": -0.0092, + "num_tokens": 204126866.0, + "reward": 0.806458342075348, + "reward_std": 0.11663768589496612, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.806458342075348, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2767653793096542, + "sampling/importance_sampling_ratio/max": 1.8235158443450927, + "sampling/importance_sampling_ratio/mean": 0.9999896287918091, + "sampling/importance_sampling_ratio/min": 0.4039942383766174, + "sampling/sampling_logp_difference/max": 0.9295182943344116, + "sampling/sampling_logp_difference/mean": 0.012854778952896596, + "step": 1530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1861.2, + "completions/max_terminated_length": 1861.2, + "completions/mean_length": 1086.84375, + "completions/mean_terminated_length": 1086.84375, + "completions/min_length": 759.8, + "completions/min_terminated_length": 759.8, + "entropy": 0.25863939225673677, + "epoch": 1.8037602820211516, + "frac_reward_zero_std": 0.3, + "grad_norm": 1.2077988386154175, + "learning_rate": 3.296583474678943e-07, + "loss": -0.0134, + "num_tokens": 204804720.0, + "reward": 0.7979166746139527, + "reward_std": 0.17481858432292938, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7979166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28416181802749635, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999544143676757, + "sampling/importance_sampling_ratio/min": 0.38569867610931396, + "sampling/sampling_logp_difference/max": 0.9860574126243591, + "sampling/sampling_logp_difference/mean": 0.013360159657895564, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 1092.853125, + "completions/mean_terminated_length": 1092.853125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.262846040725708, + "epoch": 1.809635722679201, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6944238543510437, + "learning_rate": 3.290525805669978e-07, + "loss": 0.0018, + "num_tokens": 205475425.0, + "reward": 0.7393229246139527, + "reward_std": 0.08657962083816528, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7393229246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2897666200995445, + "sampling/importance_sampling_ratio/max": 1.9605785608291626, + "sampling/importance_sampling_ratio/mean": 1.0000004768371582, + "sampling/importance_sampling_ratio/min": 0.39530388712882997, + "sampling/sampling_logp_difference/max": 0.9769327878952027, + "sampling/sampling_logp_difference/mean": 0.013001962006092072, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1548.0, + "completions/max_terminated_length": 1548.0, + "completions/mean_length": 1128.890625, + "completions/mean_terminated_length": 1128.890625, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.27651565670967104, + "epoch": 1.8155111633372503, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8144307732582092, + "learning_rate": 3.284468136661013e-07, + "loss": 0.0076, + "num_tokens": 206159118.0, + "reward": 0.7991666793823242, + "reward_std": 0.08892802894115448, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7991666793823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2119009464979172, + "sampling/importance_sampling_ratio/max": 1.920351767539978, + "sampling/importance_sampling_ratio/mean": 1.0000345706939697, + "sampling/importance_sampling_ratio/min": 0.35048373639583585, + "sampling/sampling_logp_difference/max": 1.0617512702941894, + "sampling/sampling_logp_difference/mean": 0.013503380306065082, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1675.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 1168.159375, + "completions/mean_terminated_length": 1168.159375, + "completions/min_length": 803.2, + "completions/min_terminated_length": 803.2, + "entropy": 0.2956337988376617, + "epoch": 1.8213866039952995, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5562376976013184, + "learning_rate": 3.2784104676520473e-07, + "loss": -0.0029, + "num_tokens": 206846977.0, + "reward": 0.7964062571525574, + "reward_std": 0.07668278813362121, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7964062690734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3294161468744278, + "sampling/importance_sampling_ratio/max": 1.8290458917617798, + "sampling/importance_sampling_ratio/mean": 1.000098967552185, + "sampling/importance_sampling_ratio/min": 0.3671499669551849, + "sampling/sampling_logp_difference/max": 1.039049458503723, + "sampling/sampling_logp_difference/mean": 0.014282687194645404, + "step": 1550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1794.4, + "completions/max_terminated_length": 1794.4, + "completions/mean_length": 1118.421875, + "completions/mean_terminated_length": 1118.421875, + "completions/min_length": 783.4, + "completions/min_terminated_length": 783.4, + "entropy": 0.2976920962333679, + "epoch": 1.827262044653349, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7856850624084473, + "learning_rate": 3.272352798643082e-07, + "loss": 0.0094, + "num_tokens": 207543032.0, + "reward": 0.651562511920929, + "reward_std": 0.12044147849082946, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.651562511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36984447836875917, + "sampling/importance_sampling_ratio/max": 1.9146425247192382, + "sampling/importance_sampling_ratio/mean": 0.9998530745506287, + "sampling/importance_sampling_ratio/min": 0.3384554922580719, + "sampling/sampling_logp_difference/max": 1.096525502204895, + "sampling/sampling_logp_difference/mean": 0.014470845647156238, + "step": 1555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1517.4, + "completions/max_terminated_length": 1517.4, + "completions/mean_length": 1085.29375, + "completions/mean_terminated_length": 1085.29375, + "completions/min_length": 749.2, + "completions/min_terminated_length": 749.2, + "entropy": 0.27524412870407106, + "epoch": 1.8331374853113984, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7174109816551208, + "learning_rate": 3.2662951296341166e-07, + "loss": 0.0095, + "num_tokens": 208243862.0, + "reward": 0.7854166746139526, + "reward_std": 0.11581210866570472, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7854166746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30429266691207885, + "sampling/importance_sampling_ratio/max": 1.9975382328033446, + "sampling/importance_sampling_ratio/mean": 0.9999500274658203, + "sampling/importance_sampling_ratio/min": 0.3587868869304657, + "sampling/sampling_logp_difference/max": 1.1987864136695863, + "sampling/sampling_logp_difference/mean": 0.013580608554184437, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1499.4, + "completions/max_terminated_length": 1499.4, + "completions/mean_length": 1119.565625, + "completions/mean_terminated_length": 1119.565625, + "completions/min_length": 858.6, + "completions/min_terminated_length": 858.6, + "entropy": 0.2701830804347992, + "epoch": 1.8390129259694477, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8073204159736633, + "learning_rate": 3.2602374606251515e-07, + "loss": 0.0011, + "num_tokens": 208959259.0, + "reward": 0.8133854150772095, + "reward_std": 0.11158336699008942, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8133854150772095, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2828989580273628, + "sampling/importance_sampling_ratio/max": 1.943230438232422, + "sampling/importance_sampling_ratio/mean": 0.999999463558197, + "sampling/importance_sampling_ratio/min": 0.2029788501560688, + "sampling/sampling_logp_difference/max": 1.7986388444900512, + "sampling/sampling_logp_difference/mean": 0.013567885570228099, + "step": 1565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1445.0, + "completions/max_terminated_length": 1445.0, + "completions/mean_length": 1044.534375, + "completions/mean_terminated_length": 1044.534375, + "completions/min_length": 792.6, + "completions/min_terminated_length": 792.6, + "entropy": 0.27079087495803833, + "epoch": 1.8448883666274971, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 3.2541797916161864e-07, + "loss": 0.0025, + "num_tokens": 209613382.0, + "reward": 0.810937511920929, + "reward_std": 0.06994951367378235, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.810937511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.244219571352005, + "sampling/importance_sampling_ratio/max": 1.8565600395202637, + "sampling/importance_sampling_ratio/mean": 1.0000513553619386, + "sampling/importance_sampling_ratio/min": 0.34990236461162566, + "sampling/sampling_logp_difference/max": 1.088843870162964, + "sampling/sampling_logp_difference/mean": 0.013324829936027526, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1507.4, + "completions/max_terminated_length": 1507.4, + "completions/mean_length": 1063.225, + "completions/mean_terminated_length": 1063.225, + "completions/min_length": 785.8, + "completions/min_terminated_length": 785.8, + "entropy": 0.26387048363685606, + "epoch": 1.8507638072855466, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.773414671421051, + "learning_rate": 3.2481221226072207e-07, + "loss": -0.0025, + "num_tokens": 210242766.0, + "reward": 0.934375, + "reward_std": 0.09152774214744568, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.934375, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13311923742294313, + "sampling/importance_sampling_ratio/max": 1.9508730173110962, + "sampling/importance_sampling_ratio/mean": 0.9999862909317017, + "sampling/importance_sampling_ratio/min": 0.3466633170843124, + "sampling/sampling_logp_difference/max": 1.1064417600631713, + "sampling/sampling_logp_difference/mean": 0.01319211684167385, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.4, + "completions/max_terminated_length": 1640.4, + "completions/mean_length": 1128.60625, + "completions/mean_terminated_length": 1128.60625, + "completions/min_length": 818.8, + "completions/min_terminated_length": 818.8, + "entropy": 0.2660370707511902, + "epoch": 1.8566392479435958, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.6501742005348206, + "learning_rate": 3.242064453598255e-07, + "loss": 0.0035, + "num_tokens": 210934624.0, + "reward": 0.6548958420753479, + "reward_std": 0.12596461772918702, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6548958420753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2918025851249695, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999019145965576, + "sampling/importance_sampling_ratio/min": 0.2749210774898529, + "sampling/sampling_logp_difference/max": 1.399231457710266, + "sampling/sampling_logp_difference/mean": 0.01334780901670456, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1491.4, + "completions/max_terminated_length": 1491.4, + "completions/mean_length": 1057.3125, + "completions/mean_terminated_length": 1057.3125, + "completions/min_length": 761.6, + "completions/min_terminated_length": 761.6, + "entropy": 0.262117275595665, + "epoch": 1.862514688601645, + "frac_reward_zero_std": 0.45, + "grad_norm": 2.2436435222625732, + "learning_rate": 3.2360067845892895e-07, + "loss": 0.0087, + "num_tokens": 211605716.0, + "reward": 0.7901041805744171, + "reward_std": 0.10960776507854461, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7901041805744171, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2129432439804077, + "sampling/importance_sampling_ratio/max": 1.9343363523483277, + "sampling/importance_sampling_ratio/mean": 0.9999404549598694, + "sampling/importance_sampling_ratio/min": 0.35822451710700987, + "sampling/sampling_logp_difference/max": 1.0487784624099732, + "sampling/sampling_logp_difference/mean": 0.013386520184576511, + "step": 1585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.0, + "completions/max_terminated_length": 1622.0, + "completions/mean_length": 1065.365625, + "completions/mean_terminated_length": 1065.365625, + "completions/min_length": 718.6, + "completions/min_terminated_length": 718.6, + "entropy": 0.27746407985687255, + "epoch": 1.8683901292596945, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5856614112854004, + "learning_rate": 3.2299491155803244e-07, + "loss": 0.004, + "num_tokens": 212305273.0, + "reward": 0.7411458492279053, + "reward_std": 0.084406515955925, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7411458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2793776974081993, + "sampling/importance_sampling_ratio/max": 1.961573100090027, + "sampling/importance_sampling_ratio/mean": 0.999928104877472, + "sampling/importance_sampling_ratio/min": 0.40686691403388975, + "sampling/sampling_logp_difference/max": 1.0171378612518311, + "sampling/sampling_logp_difference/mean": 0.013991770520806312, + "step": 1590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1656.6, + "completions/max_terminated_length": 1656.6, + "completions/mean_length": 1115.678125, + "completions/mean_terminated_length": 1115.678125, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "entropy": 0.24829670786857605, + "epoch": 1.874265569917744, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8266233205795288, + "learning_rate": 3.223891446571359e-07, + "loss": 0.0043, + "num_tokens": 212961378.0, + "reward": 0.8059895873069763, + "reward_std": 0.10123835355043412, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8059895873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24552496075630187, + "sampling/importance_sampling_ratio/max": 1.7967779159545898, + "sampling/importance_sampling_ratio/mean": 0.9999753475189209, + "sampling/importance_sampling_ratio/min": 0.30345211625099183, + "sampling/sampling_logp_difference/max": 1.3853749752044677, + "sampling/sampling_logp_difference/mean": 0.012761880829930305, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.8, + "completions/max_terminated_length": 1525.8, + "completions/mean_length": 1050.45, + "completions/mean_terminated_length": 1050.45, + "completions/min_length": 756.4, + "completions/min_terminated_length": 756.4, + "entropy": 0.25351338386535643, + "epoch": 1.8801410105757932, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5988499522209167, + "learning_rate": 3.2178337775623936e-07, + "loss": 0.0034, + "num_tokens": 213594482.0, + "reward": 0.9263020992279053, + "reward_std": 0.06761249005794526, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9263020992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.11542957425117492, + "sampling/importance_sampling_ratio/max": 1.9589691400527953, + "sampling/importance_sampling_ratio/mean": 1.0000545501708984, + "sampling/importance_sampling_ratio/min": 0.34012679755687714, + "sampling/sampling_logp_difference/max": 1.2352878332138062, + "sampling/sampling_logp_difference/mean": 0.012775789014995098, + "step": 1600 + }, + { + "epoch": 1.8801410105757932, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1435.84, + "eval_completions/max_terminated_length": 1435.84, + "eval_completions/mean_length": 1044.866875, + "eval_completions/mean_terminated_length": 1044.866875, + "eval_completions/min_length": 772.36, + "eval_completions/min_terminated_length": 772.36, + "eval_entropy": 0.2674348741769791, + "eval_frac_reward_zero_std": 0.53, + "eval_loss": -0.001374118379317224, + "eval_num_tokens": 213594482.0, + "eval_reward": 0.7270000171661377, + "eval_reward_std": 0.09518681436777116, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7270000171661377, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31534139573574066, + "eval_runtime": 396.0164, + "eval_samples_per_second": 0.253, + "eval_sampling/importance_sampling_ratio/max": 1.9521161270141603, + "eval_sampling/importance_sampling_ratio/mean": 0.9999828863143921, + "eval_sampling/importance_sampling_ratio/min": 0.37689221899025144, + "eval_sampling/sampling_logp_difference/max": 1.2735300946235657, + "eval_sampling/sampling_logp_difference/mean": 0.013533079214394093, + "eval_steps_per_second": 0.005, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1400.6, + "completions/max_terminated_length": 1400.6, + "completions/mean_length": 1069.303125, + "completions/mean_terminated_length": 1069.303125, + "completions/min_length": 842.2, + "completions/min_terminated_length": 842.2, + "entropy": 0.24568533301353454, + "epoch": 1.8860164512338424, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.45789584517478943, + "learning_rate": 3.2117761085534285e-07, + "loss": 0.0016, + "num_tokens": 214247363.0, + "reward": 0.7768229246139526, + "reward_std": 0.08649568557739258, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7768229246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23062911927700042, + "sampling/importance_sampling_ratio/max": 1.925460934638977, + "sampling/importance_sampling_ratio/mean": 1.0000849604606628, + "sampling/importance_sampling_ratio/min": 0.31908697783946993, + "sampling/sampling_logp_difference/max": 1.8175645828247071, + "sampling/sampling_logp_difference/mean": 0.012399931252002717, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1458.8, + "completions/max_terminated_length": 1458.8, + "completions/mean_length": 1065.69375, + "completions/mean_terminated_length": 1065.69375, + "completions/min_length": 838.4, + "completions/min_terminated_length": 838.4, + "entropy": 0.2598737061023712, + "epoch": 1.8918918918918919, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 3.205718439544463e-07, + "loss": 0.0041, + "num_tokens": 214855345.0, + "reward": 0.8633854150772095, + "reward_std": 0.06079368144273758, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8633854150772095, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18320930004119873, + "sampling/importance_sampling_ratio/max": 1.9000773906707764, + "sampling/importance_sampling_ratio/mean": 1.0001107931137085, + "sampling/importance_sampling_ratio/min": 0.425476199388504, + "sampling/sampling_logp_difference/max": 0.9314059376716614, + "sampling/sampling_logp_difference/mean": 0.0131002776324749, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1672.2, + "completions/max_terminated_length": 1612.6, + "completions/mean_length": 1148.871875, + "completions/mean_terminated_length": 1140.6226806640625, + "completions/min_length": 830.2, + "completions/min_terminated_length": 830.2, + "entropy": 0.2641796410083771, + "epoch": 1.8977673325499413, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7945494055747986, + "learning_rate": 3.199660770535498e-07, + "loss": -0.0085, + "num_tokens": 215521120.0, + "reward": 0.7901041746139527, + "reward_std": 0.12795178554952144, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7901041746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2533832848072052, + "sampling/importance_sampling_ratio/max": 1.9268634796142579, + "sampling/importance_sampling_ratio/mean": 1.0000725269317627, + "sampling/importance_sampling_ratio/min": 0.3661342471837997, + "sampling/sampling_logp_difference/max": 1.1095021486282348, + "sampling/sampling_logp_difference/mean": 0.013217655010521412, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1666.8, + "completions/max_terminated_length": 1666.8, + "completions/mean_length": 1044.421875, + "completions/mean_terminated_length": 1044.421875, + "completions/min_length": 700.6, + "completions/min_terminated_length": 700.6, + "entropy": 0.26596803069114683, + "epoch": 1.9036427732079906, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.857772707939148, + "learning_rate": 3.1936031015265327e-07, + "loss": 0.0036, + "num_tokens": 216141047.0, + "reward": 0.8739062666893005, + "reward_std": 0.12695073261857032, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8739062666893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2268179625272751, + "sampling/importance_sampling_ratio/max": 1.8997068881988526, + "sampling/importance_sampling_ratio/mean": 0.9999036669731141, + "sampling/importance_sampling_ratio/min": 0.3743781954050064, + "sampling/sampling_logp_difference/max": 1.1093499660491943, + "sampling/sampling_logp_difference/mean": 0.013530664332211017, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1535.8, + "completions/max_terminated_length": 1535.8, + "completions/mean_length": 1037.45625, + "completions/mean_terminated_length": 1037.45625, + "completions/min_length": 765.4, + "completions/min_terminated_length": 765.4, + "entropy": 0.2603359043598175, + "epoch": 1.9095182138660398, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5619133710861206, + "learning_rate": 3.187545432517567e-07, + "loss": -0.0101, + "num_tokens": 216803305.0, + "reward": 0.9479166865348816, + "reward_std": 0.06181866824626923, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9479166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13064087331295013, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001649498939513, + "sampling/importance_sampling_ratio/min": 0.3292679309844971, + "sampling/sampling_logp_difference/max": 1.2311968326568603, + "sampling/sampling_logp_difference/mean": 0.013423211500048637, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1532.4, + "completions/max_terminated_length": 1532.4, + "completions/mean_length": 1074.621875, + "completions/mean_terminated_length": 1074.621875, + "completions/min_length": 813.6, + "completions/min_terminated_length": 813.6, + "entropy": 0.2620278686285019, + "epoch": 1.9153936545240893, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7254646420478821, + "learning_rate": 3.181487763508602e-07, + "loss": -0.0052, + "num_tokens": 217482080.0, + "reward": 0.7119271039962769, + "reward_std": 0.11240836530923844, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7119271039962769, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3277711272239685, + "sampling/importance_sampling_ratio/max": 1.9904924154281616, + "sampling/importance_sampling_ratio/mean": 1.0000413656234741, + "sampling/importance_sampling_ratio/min": 0.28767693042755127, + "sampling/sampling_logp_difference/max": 1.4912013769149781, + "sampling/sampling_logp_difference/mean": 0.013511856086552143, + "step": 1630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1490.4, + "completions/max_terminated_length": 1490.4, + "completions/mean_length": 1064.96875, + "completions/mean_terminated_length": 1064.96875, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 0.26446402370929717, + "epoch": 1.9212690951821387, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.6776900291442871, + "learning_rate": 3.1754300944996363e-07, + "loss": 0.0053, + "num_tokens": 218150838.0, + "reward": 0.7807291984558106, + "reward_std": 0.13026501089334488, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7807291984558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2612376809120178, + "sampling/importance_sampling_ratio/max": 1.939541721343994, + "sampling/importance_sampling_ratio/mean": 0.9999086499214173, + "sampling/importance_sampling_ratio/min": 0.40064749121665955, + "sampling/sampling_logp_difference/max": 0.9914682865142822, + "sampling/sampling_logp_difference/mean": 0.01353347897529602, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1573.8, + "completions/max_terminated_length": 1573.8, + "completions/mean_length": 1073.8125, + "completions/mean_terminated_length": 1073.8125, + "completions/min_length": 699.4, + "completions/min_terminated_length": 699.4, + "entropy": 0.25780443847179413, + "epoch": 1.927144535840188, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7870806455612183, + "learning_rate": 3.169372425490671e-07, + "loss": 0.0056, + "num_tokens": 218791034.0, + "reward": 0.8082812786102295, + "reward_std": 0.07447053156793118, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8082812786102295, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3176675528287888, + "sampling/importance_sampling_ratio/max": 1.9315118789672852, + "sampling/importance_sampling_ratio/mean": 0.999969232082367, + "sampling/importance_sampling_ratio/min": 0.32335387766361234, + "sampling/sampling_logp_difference/max": 1.1687919616699218, + "sampling/sampling_logp_difference/mean": 0.013280937634408473, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1408.8, + "completions/max_terminated_length": 1408.8, + "completions/mean_length": 1076.375, + "completions/mean_terminated_length": 1076.375, + "completions/min_length": 748.8, + "completions/min_terminated_length": 748.8, + "entropy": 0.27145981788635254, + "epoch": 1.9330199764982372, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6460449695587158, + "learning_rate": 3.163314756481706e-07, + "loss": -0.0059, + "num_tokens": 219458946.0, + "reward": 0.8421354293823242, + "reward_std": 0.09370578080415726, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8421354293823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24492353498935698, + "sampling/importance_sampling_ratio/max": 1.9178395748138428, + "sampling/importance_sampling_ratio/mean": 0.9999516487121582, + "sampling/importance_sampling_ratio/min": 0.33379133939743044, + "sampling/sampling_logp_difference/max": 1.2747412323951721, + "sampling/sampling_logp_difference/mean": 0.01361453216522932, + "step": 1645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1519.0, + "completions/max_terminated_length": 1519.0, + "completions/mean_length": 1094.121875, + "completions/mean_terminated_length": 1094.121875, + "completions/min_length": 839.6, + "completions/min_terminated_length": 839.6, + "entropy": 0.2646895945072174, + "epoch": 1.9388954171562869, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7939705848693848, + "learning_rate": 3.1572570874727404e-07, + "loss": -0.0069, + "num_tokens": 220152905.0, + "reward": 0.7620833396911622, + "reward_std": 0.13086363822221755, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7620833396911622, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28319459557533266, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000337362289429, + "sampling/importance_sampling_ratio/min": 0.29829355180263517, + "sampling/sampling_logp_difference/max": 1.419735050201416, + "sampling/sampling_logp_difference/mean": 0.013573858141899108, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1786.4, + "completions/max_terminated_length": 1773.8, + "completions/mean_length": 1110.184375, + "completions/mean_terminated_length": 1106.42705078125, + "completions/min_length": 776.8, + "completions/min_terminated_length": 776.8, + "entropy": 0.2640155255794525, + "epoch": 1.9447708578143361, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.47779330611228943, + "learning_rate": 3.1511994184637753e-07, + "loss": -0.0012, + "num_tokens": 220831344.0, + "reward": 0.7302083492279052, + "reward_std": 0.11683603897690772, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7302083611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.38854368925094607, + "sampling/importance_sampling_ratio/max": 1.9117161750793457, + "sampling/importance_sampling_ratio/mean": 0.9999454617500305, + "sampling/importance_sampling_ratio/min": 0.288709232211113, + "sampling/sampling_logp_difference/max": 1.4568031072616576, + "sampling/sampling_logp_difference/mean": 0.013713906332850457, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1509.4, + "completions/max_terminated_length": 1509.4, + "completions/mean_length": 1074.428125, + "completions/mean_terminated_length": 1074.428125, + "completions/min_length": 821.4, + "completions/min_terminated_length": 821.4, + "entropy": 0.263632670044899, + "epoch": 1.9506462984723854, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5578112602233887, + "learning_rate": 3.1451417494548097e-07, + "loss": 0.0022, + "num_tokens": 221479513.0, + "reward": 0.8171875238418579, + "reward_std": 0.09892857819795609, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8171875238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23545759916305542, + "sampling/importance_sampling_ratio/max": 1.907174038887024, + "sampling/importance_sampling_ratio/mean": 1.000023603439331, + "sampling/importance_sampling_ratio/min": 0.3447980388998985, + "sampling/sampling_logp_difference/max": 1.2826555967330933, + "sampling/sampling_logp_difference/mean": 0.013302036374807358, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1636.0, + "completions/max_terminated_length": 1636.0, + "completions/mean_length": 1165.56875, + "completions/mean_terminated_length": 1165.56875, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "entropy": 0.27500487565994264, + "epoch": 1.9565217391304348, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7453766465187073, + "learning_rate": 3.139084080445844e-07, + "loss": -0.0032, + "num_tokens": 222190751.0, + "reward": 0.6704166889190674, + "reward_std": 0.12143459171056747, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6704166889190674, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3668683707714081, + "sampling/importance_sampling_ratio/max": 1.9511921644210815, + "sampling/importance_sampling_ratio/mean": 1.0000686645507812, + "sampling/importance_sampling_ratio/min": 0.26080631613731386, + "sampling/sampling_logp_difference/max": 1.390922975540161, + "sampling/sampling_logp_difference/mean": 0.01381862722337246, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1588.8, + "completions/max_terminated_length": 1588.8, + "completions/mean_length": 1106.275, + "completions/mean_terminated_length": 1106.275, + "completions/min_length": 848.2, + "completions/min_terminated_length": 848.2, + "entropy": 0.2858578205108643, + "epoch": 1.9623971797884843, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7552258372306824, + "learning_rate": 3.133026411436879e-07, + "loss": 0.0018, + "num_tokens": 222842407.0, + "reward": 0.8156250238418579, + "reward_std": 0.12306551039218902, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8156250238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2660649374127388, + "sampling/importance_sampling_ratio/max": 1.9279718160629273, + "sampling/importance_sampling_ratio/mean": 1.0000049114227294, + "sampling/importance_sampling_ratio/min": 0.18268027827143668, + "sampling/sampling_logp_difference/max": 1.923844289779663, + "sampling/sampling_logp_difference/mean": 0.0143051628023386, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1776.0, + "completions/max_terminated_length": 1776.0, + "completions/mean_length": 1201.125, + "completions/mean_terminated_length": 1201.125, + "completions/min_length": 876.2, + "completions/min_terminated_length": 876.2, + "entropy": 0.2890251398086548, + "epoch": 1.9682726204465335, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6220123171806335, + "learning_rate": 3.1269687424279133e-07, + "loss": -0.0051, + "num_tokens": 223545759.0, + "reward": 0.7908854246139526, + "reward_std": 0.10933632254600525, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7908854365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29638255536556246, + "sampling/importance_sampling_ratio/max": 1.9763683319091796, + "sampling/importance_sampling_ratio/mean": 1.000083565711975, + "sampling/importance_sampling_ratio/min": 0.35314607322216035, + "sampling/sampling_logp_difference/max": 1.0618009567260742, + "sampling/sampling_logp_difference/mean": 0.014300593733787536, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1733.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 1136.7125, + "completions/mean_terminated_length": 1127.995556640625, + "completions/min_length": 782.4, + "completions/min_terminated_length": 782.4, + "entropy": 0.2694635778665543, + "epoch": 1.9741480611045827, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4368976056575775, + "learning_rate": 3.120911073418948e-07, + "loss": -0.0173, + "num_tokens": 224206443.0, + "reward": 0.8182291746139526, + "reward_std": 0.07583726048469544, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8182291746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25061969757080077, + "sampling/importance_sampling_ratio/max": 1.8513876676559449, + "sampling/importance_sampling_ratio/mean": 0.9999567866325378, + "sampling/importance_sampling_ratio/min": 0.2878729492425919, + "sampling/sampling_logp_difference/max": 1.3490334510803224, + "sampling/sampling_logp_difference/mean": 0.013676924258470535, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1541.2, + "completions/max_terminated_length": 1541.2, + "completions/mean_length": 1131.984375, + "completions/mean_terminated_length": 1131.984375, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.2611004739999771, + "epoch": 1.9800235017626322, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5511998534202576, + "learning_rate": 3.1148534044099826e-07, + "loss": -0.0031, + "num_tokens": 224880918.0, + "reward": 0.798645842075348, + "reward_std": 0.08196402341127396, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.798645842075348, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25188693404197693, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000040054321289, + "sampling/importance_sampling_ratio/min": 0.3580232530832291, + "sampling/sampling_logp_difference/max": 1.0668410778045654, + "sampling/sampling_logp_difference/mean": 0.013273421488702298, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1610.0, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 1153.4875, + "completions/mean_terminated_length": 1153.4875, + "completions/min_length": 816.2, + "completions/min_terminated_length": 816.2, + "entropy": 0.29184606671333313, + "epoch": 1.9858989424206817, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4519566595554352, + "learning_rate": 3.1087957354010175e-07, + "loss": 0.004, + "num_tokens": 225568706.0, + "reward": 0.817187511920929, + "reward_std": 0.0991033136844635, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.817187511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28155410587787627, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999716281890869, + "sampling/importance_sampling_ratio/min": 0.31005223616957667, + "sampling/sampling_logp_difference/max": 1.4509097576141357, + "sampling/sampling_logp_difference/mean": 0.014347866736352443, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1595.0, + "completions/max_terminated_length": 1595.0, + "completions/mean_length": 1157.546875, + "completions/mean_terminated_length": 1157.546875, + "completions/min_length": 833.8, + "completions/min_terminated_length": 833.8, + "entropy": 0.272691935300827, + "epoch": 1.991774383078731, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.48185354471206665, + "learning_rate": 3.1027380663920524e-07, + "loss": 0.0018, + "num_tokens": 226276833.0, + "reward": 0.8002604246139526, + "reward_std": 0.07418519258499146, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8002604365348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31211295127868655, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999703764915466, + "sampling/importance_sampling_ratio/min": 0.24243721812963487, + "sampling/sampling_logp_difference/max": 1.4976640701293946, + "sampling/sampling_logp_difference/mean": 0.013997980579733848, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1685.6, + "completions/max_terminated_length": 1685.6, + "completions/mean_length": 1165.35625, + "completions/mean_terminated_length": 1165.35625, + "completions/min_length": 845.4, + "completions/min_terminated_length": 845.4, + "entropy": 0.2845953702926636, + "epoch": 1.9976498237367801, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.9591985940933228, + "learning_rate": 3.0966803973830867e-07, + "loss": 0.0079, + "num_tokens": 227020035.0, + "reward": 0.6323958516120911, + "reward_std": 0.1353075310587883, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6323958516120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3693024396896362, + "sampling/importance_sampling_ratio/max": 1.9484627723693848, + "sampling/importance_sampling_ratio/mean": 0.9999523758888245, + "sampling/importance_sampling_ratio/min": 0.44701356887817384, + "sampling/sampling_logp_difference/max": 1.0415231466293335, + "sampling/sampling_logp_difference/mean": 0.014189911261200904, + "step": 1700 + }, + { + "epoch": 1.9976498237367801, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1661.84, + "eval_completions/max_terminated_length": 1661.84, + "eval_completions/mean_length": 1161.033125, + "eval_completions/mean_terminated_length": 1161.033125, + "eval_completions/min_length": 846.36, + "eval_completions/min_terminated_length": 846.36, + "eval_entropy": 0.2866858780384064, + "eval_frac_reward_zero_std": 0.58, + "eval_loss": 0.004300988744944334, + "eval_num_tokens": 227020035.0, + "eval_reward": 0.7361770963668823, + "eval_reward_std": 0.08369863875210286, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7361770963668823, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.31068264067173, + "eval_runtime": 451.7625, + "eval_samples_per_second": 0.221, + "eval_sampling/importance_sampling_ratio/max": 1.9099996519088744, + "eval_sampling/importance_sampling_ratio/mean": 1.000064604282379, + "eval_sampling/importance_sampling_ratio/min": 0.36046827347949145, + "eval_sampling/sampling_logp_difference/max": 1.241805739402771, + "eval_sampling/sampling_logp_difference/mean": 0.014108292125165463, + "eval_steps_per_second": 0.004, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1656.6, + "completions/max_terminated_length": 1656.6, + "completions/mean_length": 1174.23125, + "completions/mean_terminated_length": 1174.23125, + "completions/min_length": 818.8, + "completions/min_terminated_length": 818.8, + "entropy": 0.2785020112991333, + "epoch": 2.00352526439483, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.871213436126709, + "learning_rate": 3.0906227283741216e-07, + "loss": 0.0072, + "num_tokens": 227740653.0, + "reward": 0.7760416746139527, + "reward_std": 0.10184991657733918, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7760416746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31230961382389066, + "sampling/importance_sampling_ratio/max": 1.9423685312271117, + "sampling/importance_sampling_ratio/mean": 1.0000242710113525, + "sampling/importance_sampling_ratio/min": 0.38754723966121674, + "sampling/sampling_logp_difference/max": 1.279812264442444, + "sampling/sampling_logp_difference/mean": 0.013877030275762082, + "step": 1705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1814.2, + "completions/max_terminated_length": 1814.2, + "completions/mean_length": 1234.190625, + "completions/mean_terminated_length": 1234.190625, + "completions/min_length": 928.4, + "completions/min_terminated_length": 928.4, + "entropy": 0.29070239663124087, + "epoch": 2.009400705052879, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5816195011138916, + "learning_rate": 3.084565059365156e-07, + "loss": 0.0039, + "num_tokens": 228487386.0, + "reward": 0.8880208492279053, + "reward_std": 0.08002980649471284, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8880208492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20378222912549973, + "sampling/importance_sampling_ratio/max": 1.9560601711273193, + "sampling/importance_sampling_ratio/mean": 1.0000455379486084, + "sampling/importance_sampling_ratio/min": 0.26356355398893355, + "sampling/sampling_logp_difference/max": 1.8925941467285157, + "sampling/sampling_logp_difference/mean": 0.01430057343095541, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2000.6, + "completions/max_terminated_length": 1791.6, + "completions/mean_length": 1251.0375, + "completions/mean_terminated_length": 1243.4466552734375, + "completions/min_length": 902.8, + "completions/min_terminated_length": 902.8, + "entropy": 0.28459187150001525, + "epoch": 2.0152761457109283, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5703986883163452, + "learning_rate": 3.078507390356191e-07, + "loss": -0.0065, + "num_tokens": 229218670.0, + "reward": 0.7763020873069764, + "reward_std": 0.10708752572536469, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7763020873069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3236579895019531, + "sampling/importance_sampling_ratio/max": 1.9888316869735718, + "sampling/importance_sampling_ratio/mean": 1.000116801261902, + "sampling/importance_sampling_ratio/min": 0.32361292839050293, + "sampling/sampling_logp_difference/max": 1.2249764919281005, + "sampling/sampling_logp_difference/mean": 0.014165903627872466, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1804.8, + "completions/max_terminated_length": 1804.8, + "completions/mean_length": 1247.0375, + "completions/mean_terminated_length": 1247.0375, + "completions/min_length": 867.8, + "completions/min_terminated_length": 867.8, + "entropy": 0.29395602345466615, + "epoch": 2.0211515863689775, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7805261611938477, + "learning_rate": 3.072449721347226e-07, + "loss": -0.0015, + "num_tokens": 229942954.0, + "reward": 0.9083333492279053, + "reward_std": 0.11098030507564545, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9083333492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17272518202662468, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999336123466491, + "sampling/importance_sampling_ratio/min": 0.35569711625576017, + "sampling/sampling_logp_difference/max": 1.0804112911224366, + "sampling/sampling_logp_difference/mean": 0.01464350800961256, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1867.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 1240.828125, + "completions/mean_terminated_length": 1240.828125, + "completions/min_length": 846.4, + "completions/min_terminated_length": 846.4, + "entropy": 0.28558818697929383, + "epoch": 2.027027027027027, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4648728668689728, + "learning_rate": 3.06639205233826e-07, + "loss": 0.003, + "num_tokens": 230669475.0, + "reward": 0.848437511920929, + "reward_std": 0.1070944607257843, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.848437511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25224395394325255, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000049948692322, + "sampling/importance_sampling_ratio/min": 0.31880449652671816, + "sampling/sampling_logp_difference/max": 1.497407031059265, + "sampling/sampling_logp_difference/mean": 0.014051317609846592, + "step": 1725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1756.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1243.184375, + "completions/mean_terminated_length": 1243.184375, + "completions/min_length": 934.2, + "completions/min_terminated_length": 934.2, + "entropy": 0.2831280082464218, + "epoch": 2.0329024676850764, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7145424485206604, + "learning_rate": 3.060334383329295e-07, + "loss": -0.0021, + "num_tokens": 231379070.0, + "reward": 0.7850000143051148, + "reward_std": 0.10773698166012764, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7850000143051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3063537538051605, + "sampling/importance_sampling_ratio/max": 1.9744776964187623, + "sampling/importance_sampling_ratio/mean": 0.9998899936676026, + "sampling/importance_sampling_ratio/min": 0.27880694568157194, + "sampling/sampling_logp_difference/max": 1.4340813398361205, + "sampling/sampling_logp_difference/mean": 0.013902221620082856, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1843.4, + "completions/max_terminated_length": 1843.4, + "completions/mean_length": 1300.975, + "completions/mean_terminated_length": 1300.975, + "completions/min_length": 938.4, + "completions/min_terminated_length": 938.4, + "entropy": 0.2918659746646881, + "epoch": 2.0387779083431257, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6800016760826111, + "learning_rate": 3.0542767143203294e-07, + "loss": 0.0042, + "num_tokens": 232105430.0, + "reward": 0.6939583420753479, + "reward_std": 0.12365827858448028, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6939583420753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30231705904006956, + "sampling/importance_sampling_ratio/max": 1.9620429277420044, + "sampling/importance_sampling_ratio/mean": 0.9999876499176026, + "sampling/importance_sampling_ratio/min": 0.3293029397726059, + "sampling/sampling_logp_difference/max": 1.3565265893936158, + "sampling/sampling_logp_difference/mean": 0.01423037126660347, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1720.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 1240.153125, + "completions/mean_terminated_length": 1240.153125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.298739355802536, + "epoch": 2.044653349001175, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.5738968253135681, + "learning_rate": 3.0482190453113643e-07, + "loss": 0.0037, + "num_tokens": 232801799.0, + "reward": 0.8646875262260437, + "reward_std": 0.14013182669878005, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8646875262260437, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22773447036743164, + "sampling/importance_sampling_ratio/max": 1.963773488998413, + "sampling/importance_sampling_ratio/mean": 1.0000531673431396, + "sampling/importance_sampling_ratio/min": 0.3736042261123657, + "sampling/sampling_logp_difference/max": 1.0158153533935548, + "sampling/sampling_logp_difference/mean": 0.01465737223625183, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1745.2, + "completions/max_terminated_length": 1745.2, + "completions/mean_length": 1227.05625, + "completions/mean_terminated_length": 1227.05625, + "completions/min_length": 891.2, + "completions/min_terminated_length": 891.2, + "entropy": 0.2842448323965073, + "epoch": 2.0505287896592246, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5650268793106079, + "learning_rate": 3.0421613763023986e-07, + "loss": 0.0065, + "num_tokens": 233537097.0, + "reward": 0.8348958373069764, + "reward_std": 0.08204673230648041, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8348958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2639793872833252, + "sampling/importance_sampling_ratio/max": 1.9064828634262085, + "sampling/importance_sampling_ratio/mean": 1.0000011801719666, + "sampling/importance_sampling_ratio/min": 0.3541615068912506, + "sampling/sampling_logp_difference/max": 1.1652244329452515, + "sampling/sampling_logp_difference/mean": 0.014108946174383163, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1584.2, + "completions/max_terminated_length": 1584.2, + "completions/mean_length": 1150.646875, + "completions/mean_terminated_length": 1150.646875, + "completions/min_length": 816.2, + "completions/min_terminated_length": 816.2, + "entropy": 0.28300136923789976, + "epoch": 2.056404230317274, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.721430242061615, + "learning_rate": 3.036103707293433e-07, + "loss": 0.0096, + "num_tokens": 234240840.0, + "reward": 0.8783854365348815, + "reward_std": 0.09260771721601486, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8783854365348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23209483027458191, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999852180480957, + "sampling/importance_sampling_ratio/min": 0.27702509611845016, + "sampling/sampling_logp_difference/max": 1.4427910327911377, + "sampling/sampling_logp_difference/mean": 0.014436537213623524, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1795.6, + "completions/max_terminated_length": 1795.6, + "completions/mean_length": 1341.33125, + "completions/mean_terminated_length": 1341.33125, + "completions/min_length": 961.6, + "completions/min_terminated_length": 961.6, + "entropy": 0.2973055899143219, + "epoch": 2.062279670975323, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7037875056266785, + "learning_rate": 3.030046038284468e-07, + "loss": 0.0024, + "num_tokens": 235011698.0, + "reward": 0.8645833492279053, + "reward_std": 0.10491203367710114, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8645833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2077777147293091, + "sampling/importance_sampling_ratio/max": 1.8878745079040526, + "sampling/importance_sampling_ratio/mean": 0.9999104261398315, + "sampling/importance_sampling_ratio/min": 0.27942183911800383, + "sampling/sampling_logp_difference/max": 1.43536217212677, + "sampling/sampling_logp_difference/mean": 0.014530559442937374, + "step": 1755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1723.2, + "completions/max_terminated_length": 1723.2, + "completions/mean_length": 1207.046875, + "completions/mean_terminated_length": 1207.046875, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "entropy": 0.28618607819080355, + "epoch": 2.0681551116333723, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4199642837047577, + "learning_rate": 3.023988369275502e-07, + "loss": -0.0002, + "num_tokens": 235711745.0, + "reward": 0.7817708373069763, + "reward_std": 0.06876285523176193, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7817708373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24962895214557648, + "sampling/importance_sampling_ratio/max": 1.9418248176574706, + "sampling/importance_sampling_ratio/mean": 0.9999440431594848, + "sampling/importance_sampling_ratio/min": 0.2679483711719513, + "sampling/sampling_logp_difference/max": 1.456888699531555, + "sampling/sampling_logp_difference/mean": 0.014030390232801438, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1463.4, + "completions/max_terminated_length": 1463.4, + "completions/mean_length": 1105.29375, + "completions/mean_terminated_length": 1105.29375, + "completions/min_length": 835.4, + "completions/min_terminated_length": 835.4, + "entropy": 0.2893945574760437, + "epoch": 2.074030552291422, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7728146314620972, + "learning_rate": 3.017930700266537e-07, + "loss": 0.0075, + "num_tokens": 236373455.0, + "reward": 0.7203125178813934, + "reward_std": 0.09382694512605667, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7203125178813934, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25324483811855314, + "sampling/importance_sampling_ratio/max": 1.998999786376953, + "sampling/importance_sampling_ratio/mean": 0.9999812364578247, + "sampling/importance_sampling_ratio/min": 0.25601265765726566, + "sampling/sampling_logp_difference/max": 2.0012857913970947, + "sampling/sampling_logp_difference/mean": 0.014354320801794529, + "step": 1765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1613.4, + "completions/max_terminated_length": 1613.4, + "completions/mean_length": 1165.15625, + "completions/mean_terminated_length": 1165.15625, + "completions/min_length": 871.0, + "completions/min_terminated_length": 871.0, + "entropy": 0.271235328912735, + "epoch": 2.079905992949471, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6383501887321472, + "learning_rate": 3.011873031257572e-07, + "loss": 0.0011, + "num_tokens": 237069169.0, + "reward": 0.7666146039962769, + "reward_std": 0.09578602537512779, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7666146039962769, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3138491868972778, + "sampling/importance_sampling_ratio/max": 1.8899298191070557, + "sampling/importance_sampling_ratio/mean": 0.9999959945678711, + "sampling/importance_sampling_ratio/min": 0.24239584915339946, + "sampling/sampling_logp_difference/max": 1.8334716081619262, + "sampling/sampling_logp_difference/mean": 0.013750493712723254, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1592.8, + "completions/max_terminated_length": 1592.8, + "completions/mean_length": 1175.80625, + "completions/mean_terminated_length": 1175.80625, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.277604877948761, + "epoch": 2.0857814336075204, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7303882241249084, + "learning_rate": 3.0058153622486064e-07, + "loss": 0.0019, + "num_tokens": 237758851.0, + "reward": 0.7031250119209289, + "reward_std": 0.12986374348402024, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7031250238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3014916732907295, + "sampling/importance_sampling_ratio/max": 1.9738976955413818, + "sampling/importance_sampling_ratio/mean": 1.0001816511154176, + "sampling/importance_sampling_ratio/min": 0.37753416895866393, + "sampling/sampling_logp_difference/max": 1.017386221885681, + "sampling/sampling_logp_difference/mean": 0.01398746259510517, + "step": 1775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.6, + "completions/max_terminated_length": 1622.6, + "completions/mean_length": 1112.41875, + "completions/mean_terminated_length": 1112.41875, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "entropy": 0.25624861419200895, + "epoch": 2.09165687426557, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.76189124584198, + "learning_rate": 2.9997576932396413e-07, + "loss": 0.0029, + "num_tokens": 238420953.0, + "reward": 0.8820833444595337, + "reward_std": 0.11259067952632903, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8820833563804626, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23160261511802674, + "sampling/importance_sampling_ratio/max": 1.8961010932922364, + "sampling/importance_sampling_ratio/mean": 1.0000606298446655, + "sampling/importance_sampling_ratio/min": 0.33668322265148165, + "sampling/sampling_logp_difference/max": 1.1480276823043822, + "sampling/sampling_logp_difference/mean": 0.013318390399217606, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1628.4, + "completions/max_terminated_length": 1628.4, + "completions/mean_length": 1162.1875, + "completions/mean_terminated_length": 1162.1875, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "entropy": 0.2680640757083893, + "epoch": 2.0975323149236194, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5976382493972778, + "learning_rate": 2.9937000242306757e-07, + "loss": -0.0004, + "num_tokens": 239096837.0, + "reward": 0.775781261920929, + "reward_std": 0.1137162283062935, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7757812738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32021387219429015, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001160860061646, + "sampling/importance_sampling_ratio/min": 0.3333402812480927, + "sampling/sampling_logp_difference/max": 1.1439990282058716, + "sampling/sampling_logp_difference/mean": 0.01385085079818964, + "step": 1785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1503.2, + "completions/max_terminated_length": 1503.2, + "completions/mean_length": 1124.1125, + "completions/mean_terminated_length": 1124.1125, + "completions/min_length": 875.2, + "completions/min_terminated_length": 875.2, + "entropy": 0.2674242615699768, + "epoch": 2.1034077555816686, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 2.9876423552217106e-07, + "loss": 0.0022, + "num_tokens": 239770009.0, + "reward": 0.7948437690734863, + "reward_std": 0.059596112370491026, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7948437690734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26889737248420714, + "sampling/importance_sampling_ratio/max": 1.8605868816375732, + "sampling/importance_sampling_ratio/mean": 1.0000072717666626, + "sampling/importance_sampling_ratio/min": 0.3218778297305107, + "sampling/sampling_logp_difference/max": 1.3112666606903076, + "sampling/sampling_logp_difference/mean": 0.013513725996017457, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1477.6, + "completions/max_terminated_length": 1477.6, + "completions/mean_length": 1035.5875, + "completions/mean_terminated_length": 1035.5875, + "completions/min_length": 697.8, + "completions/min_terminated_length": 697.8, + "entropy": 0.24554576575756074, + "epoch": 2.109283196239718, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6205167174339294, + "learning_rate": 2.9815846862127455e-07, + "loss": 0.0004, + "num_tokens": 240438149.0, + "reward": 0.7682291746139527, + "reward_std": 0.0869694009423256, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7682291746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3167840033769608, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999816417694092, + "sampling/importance_sampling_ratio/min": 0.39744282960891725, + "sampling/sampling_logp_difference/max": 1.184657597541809, + "sampling/sampling_logp_difference/mean": 0.013029644265770912, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1595.2, + "completions/max_terminated_length": 1595.2, + "completions/mean_length": 1139.05, + "completions/mean_terminated_length": 1139.05, + "completions/min_length": 796.6, + "completions/min_terminated_length": 796.6, + "entropy": 0.27050881683826444, + "epoch": 2.1151586368977675, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.6889888048171997, + "learning_rate": 2.97552701720378e-07, + "loss": 0.0004, + "num_tokens": 241151557.0, + "reward": 0.7417187571525574, + "reward_std": 0.157486841827631, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7417187690734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32728450894355776, + "sampling/importance_sampling_ratio/max": 1.932526707649231, + "sampling/importance_sampling_ratio/mean": 1.0000963449478149, + "sampling/importance_sampling_ratio/min": 0.346428656578064, + "sampling/sampling_logp_difference/max": 1.278468632698059, + "sampling/sampling_logp_difference/mean": 0.01399834081530571, + "step": 1800 + }, + { + "epoch": 2.1151586368977675, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1458.0, + "eval_completions/max_terminated_length": 1458.0, + "eval_completions/mean_length": 1057.511875, + "eval_completions/mean_terminated_length": 1057.511875, + "eval_completions/min_length": 779.68, + "eval_completions/min_terminated_length": 779.68, + "eval_entropy": 0.25933306515216825, + "eval_frac_reward_zero_std": 0.56, + "eval_loss": 0.005401823669672012, + "eval_num_tokens": 241151557.0, + "eval_reward": 0.729833345413208, + "eval_reward_std": 0.08868716955184937, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.729833345413208, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3078808444738388, + "eval_runtime": 406.5719, + "eval_samples_per_second": 0.246, + "eval_sampling/importance_sampling_ratio/max": 1.945242338180542, + "eval_sampling/importance_sampling_ratio/mean": 1.0000135159492494, + "eval_sampling/importance_sampling_ratio/min": 0.3708919485433554, + "eval_sampling/sampling_logp_difference/max": 1.7694089603424072, + "eval_sampling/sampling_logp_difference/mean": 0.013369039855897427, + "eval_steps_per_second": 0.005, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1519.4, + "completions/max_terminated_length": 1519.4, + "completions/mean_length": 1092.36875, + "completions/mean_terminated_length": 1092.36875, + "completions/min_length": 736.4, + "completions/min_terminated_length": 736.4, + "entropy": 0.24719617068767546, + "epoch": 2.1210340775558167, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.0, + "learning_rate": 2.9694693481948147e-07, + "loss": -0.0047, + "num_tokens": 241802075.0, + "reward": 0.7389062762260437, + "reward_std": 0.10075291693210602, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7389062762260437, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30727340281009674, + "sampling/importance_sampling_ratio/max": 1.9569374561309814, + "sampling/importance_sampling_ratio/mean": 0.9999837636947632, + "sampling/importance_sampling_ratio/min": 0.3740126609802246, + "sampling/sampling_logp_difference/max": 1.0094846487045288, + "sampling/sampling_logp_difference/mean": 0.01285008229315281, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1400.8, + "completions/max_terminated_length": 1400.8, + "completions/mean_length": 1044.871875, + "completions/mean_terminated_length": 1044.871875, + "completions/min_length": 732.0, + "completions/min_terminated_length": 732.0, + "entropy": 0.24336313009262084, + "epoch": 2.126909518213866, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4582900106906891, + "learning_rate": 2.963411679185849e-07, + "loss": 0.0039, + "num_tokens": 242455362.0, + "reward": 0.8213541746139527, + "reward_std": 0.08186697289347648, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8213541686534882, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21659500002861024, + "sampling/importance_sampling_ratio/max": 1.9403943777084351, + "sampling/importance_sampling_ratio/mean": 0.9999736547470093, + "sampling/importance_sampling_ratio/min": 0.2570742294192314, + "sampling/sampling_logp_difference/max": 1.5891379356384276, + "sampling/sampling_logp_difference/mean": 0.013036347925662994, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1502.8, + "completions/max_terminated_length": 1502.8, + "completions/mean_length": 1106.55625, + "completions/mean_terminated_length": 1106.55625, + "completions/min_length": 841.4, + "completions/min_terminated_length": 841.4, + "entropy": 0.25111518502235414, + "epoch": 2.132784958871915, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6569045186042786, + "learning_rate": 2.957354010176884e-07, + "loss": -0.0027, + "num_tokens": 243131412.0, + "reward": 0.8187500238418579, + "reward_std": 0.09741906523704529, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8187500238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2573035418987274, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001561999320985, + "sampling/importance_sampling_ratio/min": 0.29898915837402457, + "sampling/sampling_logp_difference/max": 2.6158031940460207, + "sampling/sampling_logp_difference/mean": 0.013235697895288468, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1578.2, + "completions/max_terminated_length": 1578.2, + "completions/mean_length": 1109.54375, + "completions/mean_terminated_length": 1109.54375, + "completions/min_length": 816.6, + "completions/min_terminated_length": 816.6, + "entropy": 0.2552741885185242, + "epoch": 2.138660399529965, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.8629297018051147, + "learning_rate": 2.951296341167919e-07, + "loss": 0.0104, + "num_tokens": 243818002.0, + "reward": 0.8177083611488343, + "reward_std": 0.16347545087337495, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8177083611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2514489233493805, + "sampling/importance_sampling_ratio/max": 1.9939273834228515, + "sampling/importance_sampling_ratio/mean": 0.9999308586120605, + "sampling/importance_sampling_ratio/min": 0.27947772494808304, + "sampling/sampling_logp_difference/max": 2.8928178787231444, + "sampling/sampling_logp_difference/mean": 0.013579142279922962, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.6, + "completions/max_terminated_length": 1492.6, + "completions/mean_length": 1132.365625, + "completions/mean_terminated_length": 1132.365625, + "completions/min_length": 836.8, + "completions/min_terminated_length": 836.8, + "entropy": 0.24973794519901277, + "epoch": 2.144535840188014, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.7742500305175781, + "learning_rate": 2.9452386721589527e-07, + "loss": 0.003, + "num_tokens": 244494887.0, + "reward": 0.8229166984558105, + "reward_std": 0.1541348308324814, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8229166984558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2586108475923538, + "sampling/importance_sampling_ratio/max": 1.8992940664291382, + "sampling/importance_sampling_ratio/mean": 1.0000396728515626, + "sampling/importance_sampling_ratio/min": 0.41788731813430785, + "sampling/sampling_logp_difference/max": 0.883245873451233, + "sampling/sampling_logp_difference/mean": 0.013097657822072506, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1717.2, + "completions/max_terminated_length": 1717.2, + "completions/mean_length": 1177.0, + "completions/mean_terminated_length": 1177.0, + "completions/min_length": 850.2, + "completions/min_terminated_length": 850.2, + "entropy": 0.26592395901679994, + "epoch": 2.1504112808460634, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8587401509284973, + "learning_rate": 2.9391810031499876e-07, + "loss": -0.0011, + "num_tokens": 245227175.0, + "reward": 0.7255208551883697, + "reward_std": 0.1329521119594574, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7255208551883697, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29523513913154603, + "sampling/importance_sampling_ratio/max": 1.981991744041443, + "sampling/importance_sampling_ratio/mean": 0.9999511480331421, + "sampling/importance_sampling_ratio/min": 0.29395384937524793, + "sampling/sampling_logp_difference/max": 1.3618286371231079, + "sampling/sampling_logp_difference/mean": 0.014108171686530113, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.2, + "completions/max_terminated_length": 1515.2, + "completions/mean_length": 1118.55, + "completions/mean_terminated_length": 1118.55, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.23380440473556519, + "epoch": 2.1562867215041126, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8163400888442993, + "learning_rate": 2.933123334141022e-07, + "loss": 0.0026, + "num_tokens": 245905831.0, + "reward": 0.9013021111488342, + "reward_std": 0.10105133950710296, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9013021111488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16864475458860398, + "sampling/importance_sampling_ratio/max": 1.9898765087127686, + "sampling/importance_sampling_ratio/mean": 1.000045382976532, + "sampling/importance_sampling_ratio/min": 0.25343484356999396, + "sampling/sampling_logp_difference/max": 1.7442261219024657, + "sampling/sampling_logp_difference/mean": 0.012684360519051552, + "step": 1835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1327.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 1012.203125, + "completions/mean_terminated_length": 1012.203125, + "completions/min_length": 780.2, + "completions/min_terminated_length": 780.2, + "entropy": 0.2402132272720337, + "epoch": 2.1621621621621623, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8790619969367981, + "learning_rate": 2.927065665132057e-07, + "loss": 0.0047, + "num_tokens": 246525720.0, + "reward": 0.8596354246139526, + "reward_std": 0.09730460494756699, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8596354365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25916803777217867, + "sampling/importance_sampling_ratio/max": 1.9251169681549072, + "sampling/importance_sampling_ratio/mean": 1.000064241886139, + "sampling/importance_sampling_ratio/min": 0.3161891311407089, + "sampling/sampling_logp_difference/max": 1.4003211498260497, + "sampling/sampling_logp_difference/mean": 0.012864516861736775, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1499.2, + "completions/max_terminated_length": 1499.2, + "completions/mean_length": 1105.99375, + "completions/mean_terminated_length": 1105.99375, + "completions/min_length": 857.2, + "completions/min_terminated_length": 857.2, + "entropy": 0.24603658318519592, + "epoch": 2.1680376028202115, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5251009464263916, + "learning_rate": 2.921007996123092e-07, + "loss": 0.0014, + "num_tokens": 247206918.0, + "reward": 0.6041666746139527, + "reward_std": 0.09626547321677208, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6041666746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.38971813917160036, + "sampling/importance_sampling_ratio/max": 1.990101408958435, + "sampling/importance_sampling_ratio/mean": 0.9999129176139832, + "sampling/importance_sampling_ratio/min": 0.37296884059906005, + "sampling/sampling_logp_difference/max": 1.1782666206359864, + "sampling/sampling_logp_difference/mean": 0.012996607273817063, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1508.2, + "completions/max_terminated_length": 1508.2, + "completions/mean_length": 1116.371875, + "completions/mean_terminated_length": 1116.371875, + "completions/min_length": 844.4, + "completions/min_terminated_length": 844.4, + "entropy": 0.2545777499675751, + "epoch": 2.1739130434782608, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6853423714637756, + "learning_rate": 2.914950327114126e-07, + "loss": -0.0086, + "num_tokens": 247856413.0, + "reward": 0.8486979246139527, + "reward_std": 0.11036976650357247, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8486979246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.250604185461998, + "sampling/importance_sampling_ratio/max": 1.9809516191482544, + "sampling/importance_sampling_ratio/mean": 0.9998941421508789, + "sampling/importance_sampling_ratio/min": 0.20959659069776534, + "sampling/sampling_logp_difference/max": 1.7914276361465453, + "sampling/sampling_logp_difference/mean": 0.013322827219963074, + "step": 1850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.4, + "completions/max_terminated_length": 1446.4, + "completions/mean_length": 1050.86875, + "completions/mean_terminated_length": 1050.86875, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.22533740401268004, + "epoch": 2.17978848413631, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6656336188316345, + "learning_rate": 2.908892658105161e-07, + "loss": 0.0003, + "num_tokens": 248524867.0, + "reward": 0.8348958611488342, + "reward_std": 0.0947174459695816, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8348958611488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2426920771598816, + "sampling/importance_sampling_ratio/max": 1.9969067335128785, + "sampling/importance_sampling_ratio/mean": 1.0000614404678345, + "sampling/importance_sampling_ratio/min": 0.3773179233074188, + "sampling/sampling_logp_difference/max": 1.1439204216003418, + "sampling/sampling_logp_difference/mean": 0.012491510808467865, + "step": 1855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1496.4, + "completions/max_terminated_length": 1496.4, + "completions/mean_length": 1021.7625, + "completions/mean_terminated_length": 1021.7625, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "entropy": 0.2335586816072464, + "epoch": 2.1856639247943597, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.501934289932251, + "learning_rate": 2.9028349890961954e-07, + "loss": -0.0001, + "num_tokens": 249182711.0, + "reward": 0.728906261920929, + "reward_std": 0.07847404927015304, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.728906261920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36090049147605896, + "sampling/importance_sampling_ratio/max": 1.8162980318069457, + "sampling/importance_sampling_ratio/mean": 0.9999779343605042, + "sampling/importance_sampling_ratio/min": 0.36308351159095764, + "sampling/sampling_logp_difference/max": 1.0326343655586243, + "sampling/sampling_logp_difference/mean": 0.012726966850459575, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1422.2, + "completions/max_terminated_length": 1422.2, + "completions/mean_length": 1061.034375, + "completions/mean_terminated_length": 1061.034375, + "completions/min_length": 775.2, + "completions/min_terminated_length": 775.2, + "entropy": 0.23590830862522125, + "epoch": 2.191539365452409, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.39766162633895874, + "learning_rate": 2.89677732008723e-07, + "loss": 0.0094, + "num_tokens": 249855890.0, + "reward": 0.7854166746139526, + "reward_std": 0.08812462836503983, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7854166746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32434244602918627, + "sampling/importance_sampling_ratio/max": 1.972498369216919, + "sampling/importance_sampling_ratio/mean": 0.9998962163925171, + "sampling/importance_sampling_ratio/min": 0.3102016121149063, + "sampling/sampling_logp_difference/max": 1.3020723462104797, + "sampling/sampling_logp_difference/mean": 0.013067025132477284, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1469.4, + "completions/max_terminated_length": 1469.4, + "completions/mean_length": 1075.846875, + "completions/mean_terminated_length": 1075.846875, + "completions/min_length": 755.2, + "completions/min_terminated_length": 755.2, + "entropy": 0.2341611683368683, + "epoch": 2.197414806110458, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7978026866912842, + "learning_rate": 2.890719651078265e-07, + "loss": 0.0006, + "num_tokens": 250511697.0, + "reward": 0.7937500238418579, + "reward_std": 0.145812551677227, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7937500238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3256185740232468, + "sampling/importance_sampling_ratio/max": 1.9232592821121215, + "sampling/importance_sampling_ratio/mean": 0.9999657511711121, + "sampling/importance_sampling_ratio/min": 0.3169225871562958, + "sampling/sampling_logp_difference/max": 1.1892317295074464, + "sampling/sampling_logp_difference/mean": 0.012808991968631745, + "step": 1870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1369.4, + "completions/max_terminated_length": 1369.4, + "completions/mean_length": 993.428125, + "completions/mean_terminated_length": 993.428125, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.22252206802368163, + "epoch": 2.203290246768508, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4887593984603882, + "learning_rate": 2.8846619820692995e-07, + "loss": 0.0004, + "num_tokens": 251148570.0, + "reward": 0.8028125166893005, + "reward_std": 0.07628065943717957, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8028125166893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22741907089948654, + "sampling/importance_sampling_ratio/max": 1.9813610076904298, + "sampling/importance_sampling_ratio/mean": 0.9999027252197266, + "sampling/importance_sampling_ratio/min": 0.3122305542230606, + "sampling/sampling_logp_difference/max": 1.2302313566207885, + "sampling/sampling_logp_difference/mean": 0.012417474761605263, + "step": 1875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1859.2, + "completions/max_terminated_length": 1857.8, + "completions/mean_length": 1118.25, + "completions/mean_terminated_length": 1105.8674560546874, + "completions/min_length": 749.8, + "completions/min_terminated_length": 749.8, + "entropy": 0.21847314536571502, + "epoch": 2.209165687426557, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8039262294769287, + "learning_rate": 2.8786043130603344e-07, + "loss": -0.0113, + "num_tokens": 251849278.0, + "reward": 0.8166666865348816, + "reward_std": 0.13573960661888124, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8166666865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23900896161794663, + "sampling/importance_sampling_ratio/max": 1.887087082862854, + "sampling/importance_sampling_ratio/mean": 0.9999661207199096, + "sampling/importance_sampling_ratio/min": 0.3232264846563339, + "sampling/sampling_logp_difference/max": 1.1839786291122436, + "sampling/sampling_logp_difference/mean": 0.012019017711281776, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1323.8, + "completions/max_terminated_length": 1323.8, + "completions/mean_length": 988.34375, + "completions/mean_terminated_length": 988.34375, + "completions/min_length": 726.4, + "completions/min_terminated_length": 726.4, + "entropy": 0.2140260010957718, + "epoch": 2.2150411280846063, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.655967652797699, + "learning_rate": 2.8725466440513693e-07, + "loss": -0.0032, + "num_tokens": 252451276.0, + "reward": 0.884375, + "reward_std": 0.05798763036727905, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8843750119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23959563821554183, + "sampling/importance_sampling_ratio/max": 1.9973509311676025, + "sampling/importance_sampling_ratio/mean": 0.999974501132965, + "sampling/importance_sampling_ratio/min": 0.3508861005679928, + "sampling/sampling_logp_difference/max": 5.175508713722229, + "sampling/sampling_logp_difference/mean": 0.011773454025387764, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1479.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 1093.64375, + "completions/mean_terminated_length": 1093.64375, + "completions/min_length": 754.6, + "completions/min_terminated_length": 754.6, + "entropy": 0.2178718239068985, + "epoch": 2.2209165687426555, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.7926600575447083, + "learning_rate": 2.8664889750424037e-07, + "loss": -0.0018, + "num_tokens": 253140282.0, + "reward": 0.8203125, + "reward_std": 0.10103268027305604, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8203125119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2567353665828705, + "sampling/importance_sampling_ratio/max": 1.9315465211868286, + "sampling/importance_sampling_ratio/mean": 1.0000144124031067, + "sampling/importance_sampling_ratio/min": 0.3091754883527756, + "sampling/sampling_logp_difference/max": 1.2065674543380738, + "sampling/sampling_logp_difference/mean": 0.012078930810093879, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.6, + "completions/max_terminated_length": 1514.6, + "completions/mean_length": 1054.10625, + "completions/mean_terminated_length": 1054.10625, + "completions/min_length": 697.4, + "completions/min_terminated_length": 697.4, + "entropy": 0.23007131218910218, + "epoch": 2.226792009400705, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6741881966590881, + "learning_rate": 2.8604313060334386e-07, + "loss": -0.0008, + "num_tokens": 253784924.0, + "reward": 0.800000011920929, + "reward_std": 0.06402734369039535, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.800000011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26122883558273313, + "sampling/importance_sampling_ratio/max": 1.8661206007003783, + "sampling/importance_sampling_ratio/mean": 1.0000409960746766, + "sampling/importance_sampling_ratio/min": 0.35816158950328825, + "sampling/sampling_logp_difference/max": 1.1304970264434815, + "sampling/sampling_logp_difference/mean": 0.012475567311048508, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1391.8, + "completions/max_terminated_length": 1391.8, + "completions/mean_length": 1053.721875, + "completions/mean_terminated_length": 1053.721875, + "completions/min_length": 827.8, + "completions/min_terminated_length": 827.8, + "entropy": 0.2240033119916916, + "epoch": 2.2326674500587544, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6372376680374146, + "learning_rate": 2.854373637024473e-07, + "loss": 0.0012, + "num_tokens": 254444355.0, + "reward": 0.8333333373069763, + "reward_std": 0.09950350672006607, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8333333373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2455697923898697, + "sampling/importance_sampling_ratio/max": 1.9715130090713502, + "sampling/importance_sampling_ratio/mean": 0.9999606847763062, + "sampling/importance_sampling_ratio/min": 0.32975890338420866, + "sampling/sampling_logp_difference/max": 1.141264510154724, + "sampling/sampling_logp_difference/mean": 0.012219694256782532, + "step": 1900 + }, + { + "epoch": 2.2326674500587544, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000625, + "eval_completions/max_length": 1452.08, + "eval_completions/max_terminated_length": 1445.28, + "eval_completions/mean_length": 1023.909375, + "eval_completions/mean_terminated_length": 1023.0136328125, + "eval_completions/min_length": 751.44, + "eval_completions/min_terminated_length": 751.44, + "eval_entropy": 0.23475720524787902, + "eval_frac_reward_zero_std": 0.53, + "eval_loss": -0.0010693141957744956, + "eval_num_tokens": 254444355.0, + "eval_reward": 0.7352083444595336, + "eval_reward_std": 0.09326890490949154, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7352083444595336, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.30484940618276596, + "eval_runtime": 394.069, + "eval_samples_per_second": 0.254, + "eval_sampling/importance_sampling_ratio/max": 1.9632924747467042, + "eval_sampling/importance_sampling_ratio/mean": 0.9999413275718689, + "eval_sampling/importance_sampling_ratio/min": 0.31770970672369003, + "eval_sampling/sampling_logp_difference/max": 1.298856236934662, + "eval_sampling/sampling_logp_difference/mean": 0.012761585600674152, + "eval_steps_per_second": 0.005, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1544.0, + "completions/max_terminated_length": 1544.0, + "completions/mean_length": 1024.86875, + "completions/mean_terminated_length": 1024.86875, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "entropy": 0.24251948595046996, + "epoch": 2.2385428907168037, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6644178628921509, + "learning_rate": 2.848315968015508e-07, + "loss": 0.0004, + "num_tokens": 255080473.0, + "reward": 0.857812511920929, + "reward_std": 0.08356589004397393, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.857812511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24869934916496278, + "sampling/importance_sampling_ratio/max": 1.9640849590301515, + "sampling/importance_sampling_ratio/mean": 0.9999147415161133, + "sampling/importance_sampling_ratio/min": 0.31352718770503996, + "sampling/sampling_logp_difference/max": 1.1885489702224732, + "sampling/sampling_logp_difference/mean": 0.01301195491105318, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1718.2, + "completions/max_terminated_length": 1665.8, + "completions/mean_length": 1055.325, + "completions/mean_terminated_length": 1046.8310302734376, + "completions/min_length": 715.4, + "completions/min_terminated_length": 715.4, + "entropy": 0.22508153021335603, + "epoch": 2.244418331374853, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.37735798954963684, + "learning_rate": 2.8422582990065416e-07, + "loss": -0.0044, + "num_tokens": 255740601.0, + "reward": 0.7631250143051147, + "reward_std": 0.0876377984881401, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7631250143051147, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2765495449304581, + "sampling/importance_sampling_ratio/max": 1.9813214778900146, + "sampling/importance_sampling_ratio/mean": 0.9998460412025452, + "sampling/importance_sampling_ratio/min": 0.36637800335884096, + "sampling/sampling_logp_difference/max": 1.1442196130752564, + "sampling/sampling_logp_difference/mean": 0.012507494539022446, + "step": 1910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1471.6, + "completions/max_terminated_length": 1471.6, + "completions/mean_length": 1069.365625, + "completions/mean_terminated_length": 1069.365625, + "completions/min_length": 731.2, + "completions/min_terminated_length": 731.2, + "entropy": 0.23527201116085053, + "epoch": 2.2502937720329026, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4593348801136017, + "learning_rate": 2.8362006299975765e-07, + "loss": 0.0089, + "num_tokens": 256408926.0, + "reward": 0.776562511920929, + "reward_std": 0.12371757328510284, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.776562511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28311349302530286, + "sampling/importance_sampling_ratio/max": 1.9783401966094971, + "sampling/importance_sampling_ratio/mean": 1.0000746846199036, + "sampling/importance_sampling_ratio/min": 0.2895603716373444, + "sampling/sampling_logp_difference/max": 1.3389286994934082, + "sampling/sampling_logp_difference/mean": 0.012912764959037305, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 1112.6125, + "completions/mean_terminated_length": 1112.6125, + "completions/min_length": 776.2, + "completions/min_terminated_length": 776.2, + "entropy": 0.23903321325778962, + "epoch": 2.256169212690952, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4446127116680145, + "learning_rate": 2.8301429609886114e-07, + "loss": 0.0022, + "num_tokens": 257103634.0, + "reward": 0.8364583611488342, + "reward_std": 0.1059862032532692, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8364583611488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.258354589343071, + "sampling/importance_sampling_ratio/max": 1.9829919576644897, + "sampling/importance_sampling_ratio/mean": 0.9999783992767334, + "sampling/importance_sampling_ratio/min": 0.2826462507247925, + "sampling/sampling_logp_difference/max": 1.4443536758422852, + "sampling/sampling_logp_difference/mean": 0.013122853077948093, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1565.4, + "completions/max_terminated_length": 1565.4, + "completions/mean_length": 1123.990625, + "completions/mean_terminated_length": 1123.990625, + "completions/min_length": 871.4, + "completions/min_terminated_length": 871.4, + "entropy": 0.22474170625209808, + "epoch": 2.262044653349001, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4290720820426941, + "learning_rate": 2.824085291979646e-07, + "loss": 0.0004, + "num_tokens": 257766399.0, + "reward": 0.8651041746139526, + "reward_std": 0.061245692521333696, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8651041746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25154909715056417, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001497983932495, + "sampling/importance_sampling_ratio/min": 0.2948518693447113, + "sampling/sampling_logp_difference/max": 1.2655499935150147, + "sampling/sampling_logp_difference/mean": 0.01205264199525118, + "step": 1925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 1148.1375, + "completions/mean_terminated_length": 1148.1375, + "completions/min_length": 848.4, + "completions/min_terminated_length": 848.4, + "entropy": 0.239599347114563, + "epoch": 2.2679200940070503, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6222954392433167, + "learning_rate": 2.8180276229706807e-07, + "loss": -0.0026, + "num_tokens": 258515195.0, + "reward": 0.7656250059604645, + "reward_std": 0.09615588523447513, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7656250059604645, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2964589536190033, + "sampling/importance_sampling_ratio/max": 1.9878407716751099, + "sampling/importance_sampling_ratio/mean": 1.0000786304473877, + "sampling/importance_sampling_ratio/min": 0.25952497124671936, + "sampling/sampling_logp_difference/max": 1.3694255113601685, + "sampling/sampling_logp_difference/mean": 0.013026346825063229, + "step": 1930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1531.2, + "completions/max_terminated_length": 1531.2, + "completions/mean_length": 1101.3125, + "completions/mean_terminated_length": 1101.3125, + "completions/min_length": 838.2, + "completions/min_terminated_length": 838.2, + "entropy": 0.24195704162120818, + "epoch": 2.2737955346651, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7305625677108765, + "learning_rate": 2.8119699539617156e-07, + "loss": -0.004, + "num_tokens": 259177519.0, + "reward": 0.8244270920753479, + "reward_std": 0.09137791246175767, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8244270920753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22105640172958374, + "sampling/importance_sampling_ratio/max": 1.9939350605010986, + "sampling/importance_sampling_ratio/mean": 0.9998633027076721, + "sampling/importance_sampling_ratio/min": 0.3458234578371048, + "sampling/sampling_logp_difference/max": 1.3980162143707275, + "sampling/sampling_logp_difference/mean": 0.013009026646614075, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1313.6, + "completions/max_terminated_length": 1313.6, + "completions/mean_length": 983.740625, + "completions/mean_terminated_length": 983.740625, + "completions/min_length": 751.2, + "completions/min_terminated_length": 751.2, + "entropy": 0.2223033517599106, + "epoch": 2.279670975323149, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4795616567134857, + "learning_rate": 2.80591228495275e-07, + "loss": -0.001, + "num_tokens": 259834556.0, + "reward": 0.7640625178813935, + "reward_std": 0.08006698191165924, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7640625178813935, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3285771101713181, + "sampling/importance_sampling_ratio/max": 1.9821281909942627, + "sampling/importance_sampling_ratio/mean": 1.0000587821006774, + "sampling/importance_sampling_ratio/min": 0.41876710653305055, + "sampling/sampling_logp_difference/max": 0.8904253721237183, + "sampling/sampling_logp_difference/mean": 0.01218780390918255, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.6, + "completions/max_terminated_length": 1554.6, + "completions/mean_length": 1059.265625, + "completions/mean_terminated_length": 1059.265625, + "completions/min_length": 740.6, + "completions/min_terminated_length": 740.6, + "entropy": 0.24593849778175353, + "epoch": 2.2855464159811985, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.9010393619537354, + "learning_rate": 2.799854615943785e-07, + "loss": 0.0023, + "num_tokens": 260492209.0, + "reward": 0.7838541865348816, + "reward_std": 0.14079142212867737, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7838541865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.301932692527771, + "sampling/importance_sampling_ratio/max": 1.9539946079254151, + "sampling/importance_sampling_ratio/mean": 1.0001062393188476, + "sampling/importance_sampling_ratio/min": 0.37328195571899414, + "sampling/sampling_logp_difference/max": 1.0297700881958007, + "sampling/sampling_logp_difference/mean": 0.013037090376019477, + "step": 1945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1615.0, + "completions/max_terminated_length": 1615.0, + "completions/mean_length": 1110.025, + "completions/mean_terminated_length": 1110.025, + "completions/min_length": 782.0, + "completions/min_terminated_length": 782.0, + "entropy": 0.23949267864227294, + "epoch": 2.291421856639248, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5884570479393005, + "learning_rate": 2.793796946934819e-07, + "loss": 0.0058, + "num_tokens": 261155113.0, + "reward": 0.8239583492279052, + "reward_std": 0.07468674443662167, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8239583492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2608910098671913, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999876737594604, + "sampling/importance_sampling_ratio/min": 0.350416773557663, + "sampling/sampling_logp_difference/max": 1.0721632480621337, + "sampling/sampling_logp_difference/mean": 0.012836766429245472, + "step": 1950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1462.6, + "completions/max_terminated_length": 1462.6, + "completions/mean_length": 1101.3125, + "completions/mean_terminated_length": 1101.3125, + "completions/min_length": 853.8, + "completions/min_terminated_length": 853.8, + "entropy": 0.24034703075885772, + "epoch": 2.2972972972972974, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.44212859869003296, + "learning_rate": 2.787739277925854e-07, + "loss": 0.0027, + "num_tokens": 261805885.0, + "reward": 0.7859375, + "reward_std": 0.05160985812544823, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7859375, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3121504485607147, + "sampling/importance_sampling_ratio/max": 1.9134503841400146, + "sampling/importance_sampling_ratio/mean": 0.9998910069465637, + "sampling/importance_sampling_ratio/min": 0.36197959780693056, + "sampling/sampling_logp_difference/max": 1.118364405632019, + "sampling/sampling_logp_difference/mean": 0.012630455754697322, + "step": 1955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.6, + "completions/max_terminated_length": 1554.6, + "completions/mean_length": 1110.775, + "completions/mean_terminated_length": 1110.775, + "completions/min_length": 830.0, + "completions/min_terminated_length": 830.0, + "entropy": 0.24084980189800262, + "epoch": 2.3031727379553466, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8059061765670776, + "learning_rate": 2.781681608916889e-07, + "loss": -0.0017, + "num_tokens": 262485749.0, + "reward": 0.7385416746139526, + "reward_std": 0.09334568008780479, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7385416746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3025936484336853, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000214099884033, + "sampling/importance_sampling_ratio/min": 0.3065564423799515, + "sampling/sampling_logp_difference/max": 1.4883403539657594, + "sampling/sampling_logp_difference/mean": 0.013054091669619083, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1720.4, + "completions/max_terminated_length": 1720.4, + "completions/mean_length": 1106.85625, + "completions/mean_terminated_length": 1106.85625, + "completions/min_length": 745.6, + "completions/min_terminated_length": 745.6, + "entropy": 0.2630400389432907, + "epoch": 2.309048178613396, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 2.7756239399079234e-07, + "loss": 0.0032, + "num_tokens": 263183159.0, + "reward": 0.7591145873069763, + "reward_std": 0.04956208989024162, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7591145873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33793588876724245, + "sampling/importance_sampling_ratio/max": 1.9396537780761718, + "sampling/importance_sampling_ratio/mean": 0.9998874068260193, + "sampling/importance_sampling_ratio/min": 0.40306171774864197, + "sampling/sampling_logp_difference/max": 0.9383793830871582, + "sampling/sampling_logp_difference/mean": 0.013850330747663975, + "step": 1965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.6, + "completions/max_terminated_length": 1515.6, + "completions/mean_length": 1097.1875, + "completions/mean_terminated_length": 1097.1875, + "completions/min_length": 781.4, + "completions/min_terminated_length": 781.4, + "entropy": 0.2332218050956726, + "epoch": 2.3149236192714455, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8961458802223206, + "learning_rate": 2.769566270898958e-07, + "loss": 0.0008, + "num_tokens": 263877299.0, + "reward": 0.8614583611488342, + "reward_std": 0.12068969756364822, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8614583611488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24193784296512605, + "sampling/importance_sampling_ratio/max": 1.9547238111495973, + "sampling/importance_sampling_ratio/mean": 0.999898374080658, + "sampling/importance_sampling_ratio/min": 0.4016565144062042, + "sampling/sampling_logp_difference/max": 0.9715569138526916, + "sampling/sampling_logp_difference/mean": 0.012774857692420483, + "step": 1970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1533.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1126.8375, + "completions/mean_terminated_length": 1126.8375, + "completions/min_length": 795.8, + "completions/min_terminated_length": 795.8, + "entropy": 0.2287152588367462, + "epoch": 2.3207990599294948, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.3858145773410797, + "learning_rate": 2.7635086018899926e-07, + "loss": 0.0003, + "num_tokens": 264564719.0, + "reward": 0.8640625238418579, + "reward_std": 0.08302248492836953, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8640625238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20304519385099412, + "sampling/importance_sampling_ratio/max": 1.984503149986267, + "sampling/importance_sampling_ratio/mean": 0.9999793767929077, + "sampling/importance_sampling_ratio/min": 0.39244469404220583, + "sampling/sampling_logp_difference/max": 1.0014652013778687, + "sampling/sampling_logp_difference/mean": 0.012451635673642159, + "step": 1975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.028125, + "completions/max_length": 1970.2, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 1229.9, + "completions/mean_terminated_length": 1199.3093505859374, + "completions/min_length": 792.4, + "completions/min_terminated_length": 792.4, + "entropy": 0.22849614918231964, + "epoch": 2.326674500587544, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5069258213043213, + "learning_rate": 2.7574509328810275e-07, + "loss": -0.0273, + "num_tokens": 265271995.0, + "reward": 0.7555208563804626, + "reward_std": 0.0813896507024765, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7555208563804626, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34186806380748747, + "sampling/importance_sampling_ratio/max": 1.9778911828994752, + "sampling/importance_sampling_ratio/mean": 0.9999969363212585, + "sampling/importance_sampling_ratio/min": 0.27652696073055266, + "sampling/sampling_logp_difference/max": 1.351327657699585, + "sampling/sampling_logp_difference/mean": 0.012531153298914433, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1454.6, + "completions/max_terminated_length": 1454.6, + "completions/mean_length": 1057.096875, + "completions/mean_terminated_length": 1057.096875, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.22988880872726442, + "epoch": 2.3325499412455932, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6030939817428589, + "learning_rate": 2.7513932638720624e-07, + "loss": 0.0031, + "num_tokens": 265935338.0, + "reward": 0.782812523841858, + "reward_std": 0.09311963804066181, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.782812523841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3007162183523178, + "sampling/importance_sampling_ratio/max": 1.9878717422485352, + "sampling/importance_sampling_ratio/mean": 1.0000318169593811, + "sampling/importance_sampling_ratio/min": 0.26722107380628585, + "sampling/sampling_logp_difference/max": 1.5060338497161865, + "sampling/sampling_logp_difference/mean": 0.012719903513789177, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1966.2, + "completions/max_terminated_length": 1966.2, + "completions/mean_length": 1223.465625, + "completions/mean_terminated_length": 1223.465625, + "completions/min_length": 884.8, + "completions/min_terminated_length": 884.8, + "entropy": 0.2435948759317398, + "epoch": 2.338425381903643, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 2.745335594863096e-07, + "loss": -0.0007, + "num_tokens": 266639487.0, + "reward": 0.838281261920929, + "reward_std": 0.09026078432798386, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.838281261920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2680289790034294, + "sampling/importance_sampling_ratio/max": 1.9598480701446532, + "sampling/importance_sampling_ratio/mean": 0.9998942852020264, + "sampling/importance_sampling_ratio/min": 0.2704477931372821, + "sampling/sampling_logp_difference/max": 2.0783373355865478, + "sampling/sampling_logp_difference/mean": 0.013020814210176469, + "step": 1990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1626.4, + "completions/max_terminated_length": 1626.4, + "completions/mean_length": 1127.540625, + "completions/mean_terminated_length": 1127.540625, + "completions/min_length": 709.0, + "completions/min_terminated_length": 709.0, + "entropy": 0.24840072691440582, + "epoch": 2.344300822561692, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5901416540145874, + "learning_rate": 2.739277925854131e-07, + "loss": 0.0035, + "num_tokens": 267330828.0, + "reward": 0.7151041865348816, + "reward_std": 0.08581260442733765, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7151041865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3605960875749588, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001015663146973, + "sampling/importance_sampling_ratio/min": 0.42087502479553224, + "sampling/sampling_logp_difference/max": 1.0152322053909302, + "sampling/sampling_logp_difference/mean": 0.013321847841143607, + "step": 1995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1965.6, + "completions/max_terminated_length": 1852.4, + "completions/mean_length": 1199.00625, + "completions/mean_terminated_length": 1191.6684326171876, + "completions/min_length": 788.4, + "completions/min_terminated_length": 788.4, + "entropy": 0.2513920724391937, + "epoch": 2.3501762632197414, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5038850903511047, + "learning_rate": 2.7332202568451655e-07, + "loss": -0.0232, + "num_tokens": 268033542.0, + "reward": 0.8557291746139526, + "reward_std": 0.07250104248523712, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8557291746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21458423137664795, + "sampling/importance_sampling_ratio/max": 1.9932072162628174, + "sampling/importance_sampling_ratio/mean": 1.0000502467155457, + "sampling/importance_sampling_ratio/min": 0.37983145117759703, + "sampling/sampling_logp_difference/max": 1.0120579957962037, + "sampling/sampling_logp_difference/mean": 0.013376428000628948, + "step": 2000 + }, + { + "epoch": 2.3501762632197414, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1608.16, + "eval_completions/max_terminated_length": 1608.16, + "eval_completions/mean_length": 1127.00875, + "eval_completions/mean_terminated_length": 1127.00875, + "eval_completions/min_length": 836.88, + "eval_completions/min_terminated_length": 836.88, + "eval_entropy": 0.25097706019878385, + "eval_frac_reward_zero_std": 0.54, + "eval_loss": 0.0010900250636041164, + "eval_num_tokens": 268033542.0, + "eval_reward": 0.7470312607288361, + "eval_reward_std": 0.08885785259306431, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7470312631130218, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.30599226742982866, + "eval_runtime": 439.0898, + "eval_samples_per_second": 0.228, + "eval_sampling/importance_sampling_ratio/max": 1.9520465230941773, + "eval_sampling/importance_sampling_ratio/mean": 0.9999663019180298, + "eval_sampling/importance_sampling_ratio/min": 0.33184033348748926, + "eval_sampling/sampling_logp_difference/max": 1.5774301075935364, + "eval_sampling/sampling_logp_difference/mean": 0.013306757099926472, + "eval_steps_per_second": 0.005, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1531.6, + "completions/max_terminated_length": 1531.6, + "completions/mean_length": 1145.7625, + "completions/mean_terminated_length": 1145.7625, + "completions/min_length": 822.0, + "completions/min_terminated_length": 822.0, + "entropy": 0.2629284977912903, + "epoch": 2.3560517038777906, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6058534383773804, + "learning_rate": 2.7271625878362004e-07, + "loss": 0.0053, + "num_tokens": 268706202.0, + "reward": 0.7458333492279052, + "reward_std": 0.08412181735038757, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7458333492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28866009414196014, + "sampling/importance_sampling_ratio/max": 1.9702019453048707, + "sampling/importance_sampling_ratio/mean": 1.0000141739845276, + "sampling/importance_sampling_ratio/min": 0.2838461309671402, + "sampling/sampling_logp_difference/max": 1.3169232606887817, + "sampling/sampling_logp_difference/mean": 0.013618256896734238, + "step": 2005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1781.0, + "completions/max_terminated_length": 1781.0, + "completions/mean_length": 1223.86875, + "completions/mean_terminated_length": 1223.86875, + "completions/min_length": 888.2, + "completions/min_terminated_length": 888.2, + "entropy": 0.24649737477302552, + "epoch": 2.3619271445358403, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6726218461990356, + "learning_rate": 2.7211049188272353e-07, + "loss": -0.0022, + "num_tokens": 269407984.0, + "reward": 0.8861979365348815, + "reward_std": 0.0921474851667881, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8861979365348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22615295350551606, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998706579208374, + "sampling/importance_sampling_ratio/min": 0.2436885952949721, + "sampling/sampling_logp_difference/max": 7.224675178527832, + "sampling/sampling_logp_difference/mean": 0.013193446025252343, + "step": 2010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1376.2, + "completions/max_terminated_length": 1376.2, + "completions/mean_length": 1066.721875, + "completions/mean_terminated_length": 1066.721875, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "entropy": 0.23765726387500763, + "epoch": 2.3678025851938895, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.8555007576942444, + "learning_rate": 2.7150472498182696e-07, + "loss": -0.0028, + "num_tokens": 270033255.0, + "reward": 0.9101562619209289, + "reward_std": 0.07478788420557976, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9101562619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13179679438471795, + "sampling/importance_sampling_ratio/max": 1.9186473608016967, + "sampling/importance_sampling_ratio/mean": 0.9999523162841797, + "sampling/importance_sampling_ratio/min": 0.3806654095649719, + "sampling/sampling_logp_difference/max": 1.0193307399749756, + "sampling/sampling_logp_difference/mean": 0.012833572551608085, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1591.8, + "completions/max_terminated_length": 1591.8, + "completions/mean_length": 1153.6875, + "completions/mean_terminated_length": 1153.6875, + "completions/min_length": 900.2, + "completions/min_terminated_length": 900.2, + "entropy": 0.2594814360141754, + "epoch": 2.3736780258519388, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.44845935702323914, + "learning_rate": 2.7089895808093045e-07, + "loss": 0.0057, + "num_tokens": 270722227.0, + "reward": 0.8557291984558105, + "reward_std": 0.11051277071237564, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8557291984558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2391131788492203, + "sampling/importance_sampling_ratio/max": 1.9824217557907104, + "sampling/importance_sampling_ratio/mean": 0.9999794125556946, + "sampling/importance_sampling_ratio/min": 0.3954480618238449, + "sampling/sampling_logp_difference/max": 1.0284236431121827, + "sampling/sampling_logp_difference/mean": 0.013628228195011615, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1617.8, + "completions/max_terminated_length": 1617.8, + "completions/mean_length": 1185.45, + "completions/mean_terminated_length": 1185.45, + "completions/min_length": 843.6, + "completions/min_terminated_length": 843.6, + "entropy": 0.2511540025472641, + "epoch": 2.3795534665099884, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.442772775888443, + "learning_rate": 2.702931911800339e-07, + "loss": 0.0026, + "num_tokens": 271446675.0, + "reward": 0.8489583492279053, + "reward_std": 0.08587736189365387, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8489583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23618671298027039, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000007688999176, + "sampling/importance_sampling_ratio/min": 0.1813358840532601, + "sampling/sampling_logp_difference/max": 2.310008430480957, + "sampling/sampling_logp_difference/mean": 0.013332589715719222, + "step": 2025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1752.6, + "completions/max_terminated_length": 1752.6, + "completions/mean_length": 1194.28125, + "completions/mean_terminated_length": 1194.28125, + "completions/min_length": 903.8, + "completions/min_terminated_length": 903.8, + "entropy": 0.25064473152160643, + "epoch": 2.3854289071680377, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.45136576890945435, + "learning_rate": 2.696874242791374e-07, + "loss": -0.0014, + "num_tokens": 272145981.0, + "reward": 0.7092187583446503, + "reward_std": 0.10091326609253884, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7092187583446503, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2888171553611755, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999277353286743, + "sampling/importance_sampling_ratio/min": 0.2797494070604444, + "sampling/sampling_logp_difference/max": 2.1815228700637816, + "sampling/sampling_logp_difference/mean": 0.013193762302398682, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1767.8, + "completions/max_terminated_length": 1767.8, + "completions/mean_length": 1246.7625, + "completions/mean_terminated_length": 1246.7625, + "completions/min_length": 885.8, + "completions/min_terminated_length": 885.8, + "entropy": 0.2690925747156143, + "epoch": 2.391304347826087, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6555709838867188, + "learning_rate": 2.6908165737824087e-07, + "loss": 0.0005, + "num_tokens": 272869025.0, + "reward": 0.878125011920929, + "reward_std": 0.05481426417827606, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.878125011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20230115056037903, + "sampling/importance_sampling_ratio/max": 1.9401631832122803, + "sampling/importance_sampling_ratio/mean": 0.9999709963798523, + "sampling/importance_sampling_ratio/min": 0.2493190795183533, + "sampling/sampling_logp_difference/max": 6.924089002609253, + "sampling/sampling_logp_difference/mean": 0.013956866040825845, + "step": 2035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1756.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1192.321875, + "completions/mean_terminated_length": 1192.321875, + "completions/min_length": 823.6, + "completions/min_terminated_length": 823.6, + "entropy": 0.25292410254478453, + "epoch": 2.397179788484136, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 2.684758904773443e-07, + "loss": 0.0022, + "num_tokens": 273569704.0, + "reward": 0.9161458492279053, + "reward_std": 0.08007604032754898, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9161458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17229452580213547, + "sampling/importance_sampling_ratio/max": 1.9493489265441895, + "sampling/importance_sampling_ratio/mean": 0.999928891658783, + "sampling/importance_sampling_ratio/min": 0.3117185816168785, + "sampling/sampling_logp_difference/max": 1.3051450490951537, + "sampling/sampling_logp_difference/mean": 0.013440205156803131, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1764.6, + "completions/max_terminated_length": 1764.6, + "completions/mean_length": 1189.453125, + "completions/mean_terminated_length": 1189.453125, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "entropy": 0.2691080868244171, + "epoch": 2.403055229142186, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7951878905296326, + "learning_rate": 2.678701235764478e-07, + "loss": 0.0033, + "num_tokens": 274263145.0, + "reward": 0.8606770873069763, + "reward_std": 0.06981215178966522, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8606770873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2001673936843872, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000786066055298, + "sampling/importance_sampling_ratio/min": 0.2740499794483185, + "sampling/sampling_logp_difference/max": 1.6949212074279785, + "sampling/sampling_logp_difference/mean": 0.013879082910716534, + "step": 2045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1926.8, + "completions/max_terminated_length": 1926.8, + "completions/mean_length": 1224.5125, + "completions/mean_terminated_length": 1224.5125, + "completions/min_length": 907.0, + "completions/min_terminated_length": 907.0, + "entropy": 0.27390618324279786, + "epoch": 2.408930669800235, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4256892204284668, + "learning_rate": 2.6726435667555123e-07, + "loss": 0.0096, + "num_tokens": 275021357.0, + "reward": 0.911718773841858, + "reward_std": 0.07080408632755279, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.911718773841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1760246217250824, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999222755432129, + "sampling/importance_sampling_ratio/min": 0.3092021256685257, + "sampling/sampling_logp_difference/max": 1.2760996580123902, + "sampling/sampling_logp_difference/mean": 0.01414750088006258, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1639.6, + "completions/max_terminated_length": 1639.6, + "completions/mean_length": 1126.825, + "completions/mean_terminated_length": 1126.825, + "completions/min_length": 749.2, + "completions/min_terminated_length": 749.2, + "entropy": 0.2581260442733765, + "epoch": 2.4148061104582843, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.45884257555007935, + "learning_rate": 2.666585897746547e-07, + "loss": 0.004, + "num_tokens": 275683397.0, + "reward": 0.8592708349227905, + "reward_std": 0.05926808714866638, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8592708349227905, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23570542484521867, + "sampling/importance_sampling_ratio/max": 1.9564772129058838, + "sampling/importance_sampling_ratio/mean": 0.9999432563781738, + "sampling/importance_sampling_ratio/min": 0.332023561000824, + "sampling/sampling_logp_difference/max": 1.158114504814148, + "sampling/sampling_logp_difference/mean": 0.013491682521998883, + "step": 2055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1776.4, + "completions/max_terminated_length": 1776.4, + "completions/mean_length": 1245.8375, + "completions/mean_terminated_length": 1245.8375, + "completions/min_length": 906.2, + "completions/min_terminated_length": 906.2, + "entropy": 0.26880968511104586, + "epoch": 2.4206815511163335, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5789676904678345, + "learning_rate": 2.660528228737582e-07, + "loss": 0.005, + "num_tokens": 276430721.0, + "reward": 0.8739583373069764, + "reward_std": 0.06670975238084793, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8739583373069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23361384719610215, + "sampling/importance_sampling_ratio/max": 1.954941201210022, + "sampling/importance_sampling_ratio/mean": 1.000061297416687, + "sampling/importance_sampling_ratio/min": 0.31866523027420046, + "sampling/sampling_logp_difference/max": 1.3414917469024659, + "sampling/sampling_logp_difference/mean": 0.013910824991762638, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1777.4, + "completions/max_terminated_length": 1777.4, + "completions/mean_length": 1222.33125, + "completions/mean_terminated_length": 1222.33125, + "completions/min_length": 792.4, + "completions/min_terminated_length": 792.4, + "entropy": 0.27435516715049746, + "epoch": 2.426556991774383, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.47820526361465454, + "learning_rate": 2.6544705597286165e-07, + "loss": 0.0092, + "num_tokens": 277175051.0, + "reward": 0.8295312643051147, + "reward_std": 0.11600432693958282, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8295312643051147, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28728988468647004, + "sampling/importance_sampling_ratio/max": 1.9975449800491334, + "sampling/importance_sampling_ratio/mean": 0.9999809026718139, + "sampling/importance_sampling_ratio/min": 0.3298905849456787, + "sampling/sampling_logp_difference/max": 1.2696374893188476, + "sampling/sampling_logp_difference/mean": 0.014326155558228493, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1779.2, + "completions/max_terminated_length": 1779.2, + "completions/mean_length": 1233.875, + "completions/mean_terminated_length": 1233.875, + "completions/min_length": 863.8, + "completions/min_terminated_length": 863.8, + "entropy": 0.26715349555015566, + "epoch": 2.4324324324324325, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.46884390711784363, + "learning_rate": 2.648412890719651e-07, + "loss": -0.0021, + "num_tokens": 277894115.0, + "reward": 0.8973958373069764, + "reward_std": 0.06307865604758263, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8973958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.135838021337986, + "sampling/importance_sampling_ratio/max": 1.9918049335479737, + "sampling/importance_sampling_ratio/mean": 1.0000259518623351, + "sampling/importance_sampling_ratio/min": 0.2770086288452148, + "sampling/sampling_logp_difference/max": 1.4482748746871947, + "sampling/sampling_logp_difference/mean": 0.013979560136795044, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 2042.6, + "completions/max_terminated_length": 1876.8, + "completions/mean_length": 1274.6, + "completions/mean_terminated_length": 1267.1865966796875, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "entropy": 0.26166120171546936, + "epoch": 2.4383078730904817, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5546722412109375, + "learning_rate": 2.642355221710685e-07, + "loss": -0.0208, + "num_tokens": 278637739.0, + "reward": 0.8460416674613953, + "reward_std": 0.07623862028121949, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8460416674613953, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22280243337154387, + "sampling/importance_sampling_ratio/max": 1.9055000066757202, + "sampling/importance_sampling_ratio/mean": 0.9998693823814392, + "sampling/importance_sampling_ratio/min": 0.3392042249441147, + "sampling/sampling_logp_difference/max": 1.152932047843933, + "sampling/sampling_logp_difference/mean": 0.013772268965840339, + "step": 2075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1897.2, + "completions/max_terminated_length": 1842.2, + "completions/mean_length": 1271.5375, + "completions/mean_terminated_length": 1267.7753173828125, + "completions/min_length": 902.8, + "completions/min_terminated_length": 902.8, + "entropy": 0.2623803198337555, + "epoch": 2.444183313748531, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.679646372795105, + "learning_rate": 2.63629755270172e-07, + "loss": -0.003, + "num_tokens": 279384675.0, + "reward": 0.6657291650772095, + "reward_std": 0.14221025630831718, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6657291650772095, + "rewards/e2e_recall_precision_mixed_reward/std": 0.38955762386322024, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999815106391907, + "sampling/importance_sampling_ratio/min": 0.3362751841545105, + "sampling/sampling_logp_difference/max": 1.1838315725326538, + "sampling/sampling_logp_difference/mean": 0.013782840594649315, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1718.2, + "completions/max_terminated_length": 1718.2, + "completions/mean_length": 1200.925, + "completions/mean_terminated_length": 1200.925, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "entropy": 0.26449680924415586, + "epoch": 2.4500587544065806, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6856433749198914, + "learning_rate": 2.630239883692755e-07, + "loss": -0.0049, + "num_tokens": 280092555.0, + "reward": 0.8588541746139526, + "reward_std": 0.08078035041689872, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8588541746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17709082067012788, + "sampling/importance_sampling_ratio/max": 1.936391544342041, + "sampling/importance_sampling_ratio/mean": 1.0000238418579102, + "sampling/importance_sampling_ratio/min": 0.3401082783937454, + "sampling/sampling_logp_difference/max": 1.2312336444854737, + "sampling/sampling_logp_difference/mean": 0.013856485113501548, + "step": 2085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1737.6, + "completions/max_terminated_length": 1737.6, + "completions/mean_length": 1294.9875, + "completions/mean_terminated_length": 1294.9875, + "completions/min_length": 969.4, + "completions/min_terminated_length": 969.4, + "entropy": 0.2628588765859604, + "epoch": 2.45593419506463, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5648078918457031, + "learning_rate": 2.6241822146837893e-07, + "loss": 0.0025, + "num_tokens": 280810599.0, + "reward": 0.8257812619209289, + "reward_std": 0.0739155262708664, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8257812619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29022433459758756, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999966311454773, + "sampling/importance_sampling_ratio/min": 0.20766952782869338, + "sampling/sampling_logp_difference/max": 1.7227513313293457, + "sampling/sampling_logp_difference/mean": 0.013284758664667606, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1860.0, + "completions/max_terminated_length": 1860.0, + "completions/mean_length": 1239.553125, + "completions/mean_terminated_length": 1239.553125, + "completions/min_length": 862.2, + "completions/min_terminated_length": 862.2, + "entropy": 0.2770519554615021, + "epoch": 2.461809635722679, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.43463632464408875, + "learning_rate": 2.618124545674824e-07, + "loss": 0.0122, + "num_tokens": 281529608.0, + "reward": 0.7165104269981384, + "reward_std": 0.07144647724926471, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7165104269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32212514281272886, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000048518180846, + "sampling/importance_sampling_ratio/min": 0.366385692358017, + "sampling/sampling_logp_difference/max": 1.152387547492981, + "sampling/sampling_logp_difference/mean": 0.014233771339058876, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1768.6, + "completions/max_terminated_length": 1768.6, + "completions/mean_length": 1226.5125, + "completions/mean_terminated_length": 1226.5125, + "completions/min_length": 857.6, + "completions/min_terminated_length": 857.6, + "entropy": 0.2612621784210205, + "epoch": 2.4676850763807288, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.3621138632297516, + "learning_rate": 2.6120668766658586e-07, + "loss": -0.0037, + "num_tokens": 282204684.0, + "reward": 0.8427083492279053, + "reward_std": 0.07418096661567689, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8427083492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22203905284404754, + "sampling/importance_sampling_ratio/max": 1.9696928262710571, + "sampling/importance_sampling_ratio/mean": 0.9999515414237976, + "sampling/importance_sampling_ratio/min": 0.32047736793756487, + "sampling/sampling_logp_difference/max": 1.3083037376403808, + "sampling/sampling_logp_difference/mean": 0.013291514292359353, + "step": 2100 + }, + { + "epoch": 2.4676850763807288, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000625, + "eval_completions/max_length": 1706.68, + "eval_completions/max_terminated_length": 1705.28, + "eval_completions/mean_length": 1182.015625, + "eval_completions/mean_terminated_length": 1181.2093798828125, + "eval_completions/min_length": 865.84, + "eval_completions/min_terminated_length": 865.84, + "eval_entropy": 0.2664040964841843, + "eval_frac_reward_zero_std": 0.6, + "eval_loss": -0.0011476209620013833, + "eval_num_tokens": 282204684.0, + "eval_reward": 0.7496771001815796, + "eval_reward_std": 0.08451524078845978, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7496771001815796, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3025799497961998, + "eval_runtime": 464.0933, + "eval_samples_per_second": 0.215, + "eval_sampling/importance_sampling_ratio/max": 1.923496961593628, + "eval_sampling/importance_sampling_ratio/mean": 0.9999792790412902, + "eval_sampling/importance_sampling_ratio/min": 0.3148359860479832, + "eval_sampling/sampling_logp_difference/max": 1.4014044141769408, + "eval_sampling/sampling_logp_difference/mean": 0.013713168315589427, + "eval_steps_per_second": 0.004, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1992.8, + "completions/max_terminated_length": 1992.8, + "completions/mean_length": 1313.50625, + "completions/mean_terminated_length": 1313.50625, + "completions/min_length": 933.2, + "completions/min_terminated_length": 933.2, + "entropy": 0.2658017486333847, + "epoch": 2.473560517038778, + "frac_reward_zero_std": 0.55, + "grad_norm": 1.3053785562515259, + "learning_rate": 2.6060092076568935e-07, + "loss": -0.0029, + "num_tokens": 282926030.0, + "reward": 0.8159895896911621, + "reward_std": 0.10865789279341698, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8159895896911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.277511340379715, + "sampling/importance_sampling_ratio/max": 1.9750770807266236, + "sampling/importance_sampling_ratio/mean": 1.0000683188438415, + "sampling/importance_sampling_ratio/min": 0.35810833275318144, + "sampling/sampling_logp_difference/max": 1.1295485258102418, + "sampling/sampling_logp_difference/mean": 0.013428233750164508, + "step": 2105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1765.4, + "completions/max_terminated_length": 1765.4, + "completions/mean_length": 1209.60625, + "completions/mean_terminated_length": 1209.60625, + "completions/min_length": 854.2, + "completions/min_terminated_length": 854.2, + "entropy": 0.2738288462162018, + "epoch": 2.4794359576968272, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4568771719932556, + "learning_rate": 2.5999515386479284e-07, + "loss": 0.0041, + "num_tokens": 283648064.0, + "reward": 0.835156261920929, + "reward_std": 0.07496549636125564, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.835156261920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21661584377288817, + "sampling/importance_sampling_ratio/max": 1.8551706790924072, + "sampling/importance_sampling_ratio/mean": 0.9999988913536072, + "sampling/importance_sampling_ratio/min": 0.3922689139842987, + "sampling/sampling_logp_difference/max": 0.945212459564209, + "sampling/sampling_logp_difference/mean": 0.014047329686582089, + "step": 2110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1461.0, + "completions/max_terminated_length": 1461.0, + "completions/mean_length": 1102.090625, + "completions/mean_terminated_length": 1102.090625, + "completions/min_length": 819.0, + "completions/min_terminated_length": 819.0, + "entropy": 0.24425167739391326, + "epoch": 2.4853113983548765, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5156678557395935, + "learning_rate": 2.593893869638963e-07, + "loss": -0.0031, + "num_tokens": 284362237.0, + "reward": 0.7421875238418579, + "reward_std": 0.07766841053962707, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7421875238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3157324016094208, + "sampling/importance_sampling_ratio/max": 1.9938726425170898, + "sampling/importance_sampling_ratio/mean": 0.9999809503555298, + "sampling/importance_sampling_ratio/min": 0.3194470554590225, + "sampling/sampling_logp_difference/max": 1.3359221458435058, + "sampling/sampling_logp_difference/mean": 0.013078029081225396, + "step": 2115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1704.8, + "completions/max_terminated_length": 1704.8, + "completions/mean_length": 1215.6125, + "completions/mean_terminated_length": 1215.6125, + "completions/min_length": 815.6, + "completions/min_terminated_length": 815.6, + "entropy": 0.28841713070869446, + "epoch": 2.491186839012926, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5904704332351685, + "learning_rate": 2.5878362006299976e-07, + "loss": -0.0011, + "num_tokens": 285089361.0, + "reward": 0.7183854222297669, + "reward_std": 0.08566985726356506, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7183854222297669, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3276542216539383, + "sampling/importance_sampling_ratio/max": 1.9800874948501588, + "sampling/importance_sampling_ratio/mean": 1.00006422996521, + "sampling/importance_sampling_ratio/min": 0.28533509075641633, + "sampling/sampling_logp_difference/max": 1.3343998670578003, + "sampling/sampling_logp_difference/mean": 0.014605691842734813, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1797.0, + "completions/max_terminated_length": 1701.8, + "completions/mean_length": 1217.321875, + "completions/mean_terminated_length": 1213.289013671875, + "completions/min_length": 903.0, + "completions/min_terminated_length": 903.0, + "entropy": 0.26930312514305116, + "epoch": 2.4970622796709754, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6213445067405701, + "learning_rate": 2.581778531621032e-07, + "loss": -0.0049, + "num_tokens": 285775444.0, + "reward": 0.8020833611488343, + "reward_std": 0.13593244403600693, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28477261066436765, + "sampling/importance_sampling_ratio/max": 1.9315150022506713, + "sampling/importance_sampling_ratio/mean": 0.9999381899833679, + "sampling/importance_sampling_ratio/min": 0.34449381977319715, + "sampling/sampling_logp_difference/max": 1.3405115842819213, + "sampling/sampling_logp_difference/mean": 0.013642283342778682, + "step": 2125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 2057.2, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1294.4125, + "completions/mean_terminated_length": 1291.0941650390625, + "completions/min_length": 868.2, + "completions/min_terminated_length": 868.2, + "entropy": 0.26585444808006287, + "epoch": 2.5029377203290246, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6043793559074402, + "learning_rate": 2.575720862612067e-07, + "loss": 0.009, + "num_tokens": 286531988.0, + "reward": 0.8127604246139526, + "reward_std": 0.09144037812948227, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8127604246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2378230720758438, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999789953231811, + "sampling/importance_sampling_ratio/min": 0.34047214686870575, + "sampling/sampling_logp_difference/max": 1.1415401697158813, + "sampling/sampling_logp_difference/mean": 0.013809867203235626, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1787.8, + "completions/max_terminated_length": 1787.8, + "completions/mean_length": 1196.678125, + "completions/mean_terminated_length": 1196.678125, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "entropy": 0.26385495364665984, + "epoch": 2.5088131609870743, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4344102740287781, + "learning_rate": 2.569663193603102e-07, + "loss": 0.0036, + "num_tokens": 287241901.0, + "reward": 0.7539583444595337, + "reward_std": 0.09427325129508972, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7539583444595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.350679612159729, + "sampling/importance_sampling_ratio/max": 1.8801328659057617, + "sampling/importance_sampling_ratio/mean": 1.0000503182411193, + "sampling/importance_sampling_ratio/min": 0.29096491932868956, + "sampling/sampling_logp_difference/max": 1.2413696765899658, + "sampling/sampling_logp_difference/mean": 0.013599349372088908, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1791.0, + "completions/max_terminated_length": 1791.0, + "completions/mean_length": 1240.184375, + "completions/mean_terminated_length": 1240.184375, + "completions/min_length": 815.2, + "completions/min_terminated_length": 815.2, + "entropy": 0.2642264664173126, + "epoch": 2.5146886016451235, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.4803769588470459, + "learning_rate": 2.563605524594136e-07, + "loss": -0.0011, + "num_tokens": 287960408.0, + "reward": 0.7619791865348816, + "reward_std": 0.10066340118646622, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7619791984558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2697122097015381, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999095678329468, + "sampling/importance_sampling_ratio/min": 0.22195086255669594, + "sampling/sampling_logp_difference/max": 1.8092245578765869, + "sampling/sampling_logp_difference/mean": 0.013568362034857274, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1848.6, + "completions/max_terminated_length": 1848.6, + "completions/mean_length": 1211.046875, + "completions/mean_terminated_length": 1211.046875, + "completions/min_length": 914.0, + "completions/min_terminated_length": 914.0, + "entropy": 0.2546990215778351, + "epoch": 2.5205640423031728, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.43225082755088806, + "learning_rate": 2.557547855585171e-07, + "loss": -0.0026, + "num_tokens": 288649303.0, + "reward": 0.8182291865348816, + "reward_std": 0.08594144955277443, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8182291865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24452958032488822, + "sampling/importance_sampling_ratio/max": 1.9426300525665283, + "sampling/importance_sampling_ratio/mean": 1.0000913858413696, + "sampling/importance_sampling_ratio/min": 0.28692914694547655, + "sampling/sampling_logp_difference/max": 1.36034255027771, + "sampling/sampling_logp_difference/mean": 0.013059111684560776, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1521.6, + "completions/max_terminated_length": 1521.6, + "completions/mean_length": 1158.971875, + "completions/mean_terminated_length": 1158.971875, + "completions/min_length": 846.4, + "completions/min_terminated_length": 846.4, + "entropy": 0.2745051383972168, + "epoch": 2.526439482961222, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.663104772567749, + "learning_rate": 2.5514901865762054e-07, + "loss": 0.0025, + "num_tokens": 289380430.0, + "reward": 0.8361458539962768, + "reward_std": 0.10924627855420113, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8361458539962768, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2811933159828186, + "sampling/importance_sampling_ratio/max": 1.9657442808151244, + "sampling/importance_sampling_ratio/mean": 0.9999931931495667, + "sampling/importance_sampling_ratio/min": 0.39454739093780516, + "sampling/sampling_logp_difference/max": 0.9744221210479737, + "sampling/sampling_logp_difference/mean": 0.014131250046193599, + "step": 2150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1660.0, + "completions/max_terminated_length": 1660.0, + "completions/mean_length": 1141.4625, + "completions/mean_terminated_length": 1141.4625, + "completions/min_length": 798.2, + "completions/min_terminated_length": 798.2, + "entropy": 0.2594828069210052, + "epoch": 2.5323149236192712, + "frac_reward_zero_std": 0.45, + "grad_norm": 1.1496278047561646, + "learning_rate": 2.54543251756724e-07, + "loss": 0.0095, + "num_tokens": 290082946.0, + "reward": 0.86953125, + "reward_std": 0.10211466997861862, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.869531261920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22828298360109328, + "sampling/importance_sampling_ratio/max": 1.914345669746399, + "sampling/importance_sampling_ratio/mean": 0.9999447941780091, + "sampling/importance_sampling_ratio/min": 0.33225939571857455, + "sampling/sampling_logp_difference/max": 1.1551615476608277, + "sampling/sampling_logp_difference/mean": 0.013497978821396828, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1804.8, + "completions/max_terminated_length": 1804.8, + "completions/mean_length": 1214.8125, + "completions/mean_terminated_length": 1214.8125, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.2724804818630219, + "epoch": 2.538190364277321, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.44894957542419434, + "learning_rate": 2.5393748485582747e-07, + "loss": 0.0016, + "num_tokens": 290774182.0, + "reward": 0.909375011920929, + "reward_std": 0.06227758340537548, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.909375011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18587272763252258, + "sampling/importance_sampling_ratio/max": 1.8864954710006714, + "sampling/importance_sampling_ratio/mean": 0.9999249219894409, + "sampling/importance_sampling_ratio/min": 0.313013830780983, + "sampling/sampling_logp_difference/max": 1.5233005046844483, + "sampling/sampling_logp_difference/mean": 0.013677316159009934, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1856.6, + "completions/max_terminated_length": 1856.6, + "completions/mean_length": 1215.196875, + "completions/mean_terminated_length": 1215.196875, + "completions/min_length": 869.4, + "completions/min_terminated_length": 869.4, + "entropy": 0.26370081305503845, + "epoch": 2.54406580493537, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.583694577217102, + "learning_rate": 2.533317179549309e-07, + "loss": 0.0099, + "num_tokens": 291523045.0, + "reward": 0.7243229269981384, + "reward_std": 0.07695610821247101, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7243229269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2652903199195862, + "sampling/importance_sampling_ratio/max": 1.9399638891220092, + "sampling/importance_sampling_ratio/mean": 1.000074291229248, + "sampling/importance_sampling_ratio/min": 0.25444764718413354, + "sampling/sampling_logp_difference/max": 1.7197787404060363, + "sampling/sampling_logp_difference/mean": 0.013512972928583621, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1562.8, + "completions/max_terminated_length": 1562.8, + "completions/mean_length": 1180.584375, + "completions/mean_terminated_length": 1180.584375, + "completions/min_length": 923.2, + "completions/min_terminated_length": 923.2, + "entropy": 0.25344176292419435, + "epoch": 2.5499412455934194, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.4013759195804596, + "learning_rate": 2.527259510540344e-07, + "loss": 0.0009, + "num_tokens": 292235200.0, + "reward": 0.818750011920929, + "reward_std": 0.04898076355457306, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.818750011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25387853682041167, + "sampling/importance_sampling_ratio/max": 1.9529661893844605, + "sampling/importance_sampling_ratio/mean": 1.0000003099441528, + "sampling/importance_sampling_ratio/min": 0.333037468791008, + "sampling/sampling_logp_difference/max": 1.4715055704116822, + "sampling/sampling_logp_difference/mean": 0.013367529399693013, + "step": 2170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1429.2, + "completions/max_terminated_length": 1429.2, + "completions/mean_length": 1136.7625, + "completions/mean_terminated_length": 1136.7625, + "completions/min_length": 874.8, + "completions/min_terminated_length": 874.8, + "entropy": 0.2680712938308716, + "epoch": 2.555816686251469, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.7385175228118896, + "learning_rate": 2.5212018415313783e-07, + "loss": 0.0054, + "num_tokens": 292919940.0, + "reward": 0.8784374952316284, + "reward_std": 0.06511118579655886, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8784374952316284, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20323096066713334, + "sampling/importance_sampling_ratio/max": 1.9584675550460815, + "sampling/importance_sampling_ratio/mean": 1.0000271797180176, + "sampling/importance_sampling_ratio/min": 0.32080017030239105, + "sampling/sampling_logp_difference/max": 1.3096740007400514, + "sampling/sampling_logp_difference/mean": 0.013850261084735393, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1661.6, + "completions/max_terminated_length": 1661.6, + "completions/mean_length": 1151.040625, + "completions/mean_terminated_length": 1151.040625, + "completions/min_length": 855.2, + "completions/min_terminated_length": 855.2, + "entropy": 0.25901271402835846, + "epoch": 2.5616921269095183, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.416144460439682, + "learning_rate": 2.515144172522413e-07, + "loss": -0.0003, + "num_tokens": 293619409.0, + "reward": 0.7942708492279053, + "reward_std": 0.10647799670696259, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7942708492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2655137896537781, + "sampling/importance_sampling_ratio/max": 1.9906936645507813, + "sampling/importance_sampling_ratio/mean": 0.9999905347824096, + "sampling/importance_sampling_ratio/min": 0.3820026218891144, + "sampling/sampling_logp_difference/max": 1.1053644180297852, + "sampling/sampling_logp_difference/mean": 0.013533038832247258, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1738.6, + "completions/max_terminated_length": 1738.6, + "completions/mean_length": 1254.803125, + "completions/mean_terminated_length": 1254.803125, + "completions/min_length": 936.8, + "completions/min_terminated_length": 936.8, + "entropy": 0.2685577243566513, + "epoch": 2.5675675675675675, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6036714315414429, + "learning_rate": 2.509086503513448e-07, + "loss": -0.0023, + "num_tokens": 294351490.0, + "reward": 0.871875, + "reward_std": 0.10807717889547348, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.871875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25218716263771057, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000145792961121, + "sampling/importance_sampling_ratio/min": 0.334485599398613, + "sampling/sampling_logp_difference/max": 1.3058324337005616, + "sampling/sampling_logp_difference/mean": 0.013811002299189568, + "step": 2185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1710.2, + "completions/max_terminated_length": 1710.2, + "completions/mean_length": 1185.6, + "completions/mean_terminated_length": 1185.6, + "completions/min_length": 926.2, + "completions/min_terminated_length": 926.2, + "entropy": 0.2519607961177826, + "epoch": 2.573443008225617, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.429861843585968, + "learning_rate": 2.5030288345044824e-07, + "loss": 0.0046, + "num_tokens": 295068802.0, + "reward": 0.6942708432674408, + "reward_std": 0.0552545927464962, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6942708432674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28056047260761263, + "sampling/importance_sampling_ratio/max": 1.9829387187957763, + "sampling/importance_sampling_ratio/mean": 1.0000043034553527, + "sampling/importance_sampling_ratio/min": 0.39192359447479247, + "sampling/sampling_logp_difference/max": 0.969468641281128, + "sampling/sampling_logp_difference/mean": 0.013157148286700249, + "step": 2190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1570.6, + "completions/max_terminated_length": 1570.6, + "completions/mean_length": 1153.78125, + "completions/mean_terminated_length": 1153.78125, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "entropy": 0.25543263256549836, + "epoch": 2.579318448883666, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.484397828578949, + "learning_rate": 2.4969711654955173e-07, + "loss": 0.0083, + "num_tokens": 295767884.0, + "reward": 0.7614583492279052, + "reward_std": 0.10435431525111198, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7614583492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2718092933297157, + "sampling/importance_sampling_ratio/max": 1.9637941360473632, + "sampling/importance_sampling_ratio/mean": 0.9999634385108948, + "sampling/importance_sampling_ratio/min": 0.3485614687204361, + "sampling/sampling_logp_difference/max": 1.1356598377227782, + "sampling/sampling_logp_difference/mean": 0.013540227897465229, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.8, + "completions/max_terminated_length": 1515.8, + "completions/mean_length": 1139.3125, + "completions/mean_terminated_length": 1139.3125, + "completions/min_length": 892.4, + "completions/min_terminated_length": 892.4, + "entropy": 0.24383938014507295, + "epoch": 2.5851938895417157, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.8983497023582458, + "learning_rate": 2.4909134964865517e-07, + "loss": 0.0002, + "num_tokens": 296434144.0, + "reward": 0.7223958492279052, + "reward_std": 0.12622348368167877, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7223958492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29917227625846865, + "sampling/importance_sampling_ratio/max": 1.972477102279663, + "sampling/importance_sampling_ratio/mean": 0.9999940752983093, + "sampling/importance_sampling_ratio/min": 0.3770484387874603, + "sampling/sampling_logp_difference/max": 1.139905858039856, + "sampling/sampling_logp_difference/mean": 0.01284611839801073, + "step": 2200 + }, + { + "epoch": 2.5851938895417157, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.000625, + "eval_completions/max_length": 1606.24, + "eval_completions/max_terminated_length": 1601.2, + "eval_completions/mean_length": 1145.1075, + "eval_completions/mean_terminated_length": 1144.29046875, + "eval_completions/min_length": 848.8, + "eval_completions/min_terminated_length": 848.8, + "eval_entropy": 0.26203078508377076, + "eval_frac_reward_zero_std": 0.62, + "eval_loss": 0.0016268673352897167, + "eval_num_tokens": 296434144.0, + "eval_reward": 0.7521041774749756, + "eval_reward_std": 0.0784831927716732, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7521041774749756, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29837510973215103, + "eval_runtime": 437.465, + "eval_samples_per_second": 0.229, + "eval_sampling/importance_sampling_ratio/max": 1.9403350448608399, + "eval_sampling/importance_sampling_ratio/mean": 1.000009639263153, + "eval_sampling/importance_sampling_ratio/min": 0.3114258821308613, + "eval_sampling/sampling_logp_difference/max": 1.4294465684890747, + "eval_sampling/sampling_logp_difference/mean": 0.013600032962858676, + "eval_steps_per_second": 0.005, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1612.0, + "completions/max_terminated_length": 1612.0, + "completions/mean_length": 1124.58125, + "completions/mean_terminated_length": 1124.58125, + "completions/min_length": 762.4, + "completions/min_terminated_length": 762.4, + "entropy": 0.2596117079257965, + "epoch": 2.591069330199765, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.47432422637939453, + "learning_rate": 2.4848558274775866e-07, + "loss": 0.0061, + "num_tokens": 297112778.0, + "reward": 0.8008854269981385, + "reward_std": 0.06376661993563175, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8008854269981385, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33332130759954454, + "sampling/importance_sampling_ratio/max": 1.938689661026001, + "sampling/importance_sampling_ratio/mean": 0.9999597907066345, + "sampling/importance_sampling_ratio/min": 0.3991494715213776, + "sampling/sampling_logp_difference/max": 1.0992213487625122, + "sampling/sampling_logp_difference/mean": 0.013456992991268634, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 1105.88125, + "completions/mean_terminated_length": 1105.88125, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "entropy": 0.24657217562198638, + "epoch": 2.5969447708578146, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 2.4787981584686215e-07, + "loss": -0.004, + "num_tokens": 297772052.0, + "reward": 0.8242187619209289, + "reward_std": 0.10624904036521912, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.82421875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20129311084747314, + "sampling/importance_sampling_ratio/max": 1.959466028213501, + "sampling/importance_sampling_ratio/mean": 1.0000609636306763, + "sampling/importance_sampling_ratio/min": 0.339236655831337, + "sampling/sampling_logp_difference/max": 1.203588342666626, + "sampling/sampling_logp_difference/mean": 0.013020346313714981, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1395.2, + "completions/max_terminated_length": 1395.2, + "completions/mean_length": 1025.321875, + "completions/mean_terminated_length": 1025.321875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "entropy": 0.2446480482816696, + "epoch": 2.602820211515864, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6485159993171692, + "learning_rate": 2.472740489459656e-07, + "loss": 0.0036, + "num_tokens": 298428219.0, + "reward": 0.7654687643051148, + "reward_std": 0.07041542753577232, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7654687643051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31836703717708587, + "sampling/importance_sampling_ratio/max": 1.929987335205078, + "sampling/importance_sampling_ratio/mean": 0.9999933481216431, + "sampling/importance_sampling_ratio/min": 0.37912967801094055, + "sampling/sampling_logp_difference/max": 0.9853403449058533, + "sampling/sampling_logp_difference/mean": 0.012882444821298123, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1636.4, + "completions/max_terminated_length": 1636.4, + "completions/mean_length": 1112.94375, + "completions/mean_terminated_length": 1112.94375, + "completions/min_length": 722.4, + "completions/min_terminated_length": 722.4, + "entropy": 0.24474719762802125, + "epoch": 2.608695652173913, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7990810871124268, + "learning_rate": 2.46668282045069e-07, + "loss": -0.0006, + "num_tokens": 299120297.0, + "reward": 0.7864583611488343, + "reward_std": 0.10424329489469528, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7864583611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.305034938454628, + "sampling/importance_sampling_ratio/max": 1.9024056434631347, + "sampling/importance_sampling_ratio/mean": 0.9999809026718139, + "sampling/importance_sampling_ratio/min": 0.2755643067397159, + "sampling/sampling_logp_difference/max": 3.7205732345581053, + "sampling/sampling_logp_difference/mean": 0.013068239949643613, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1734.8, + "completions/max_terminated_length": 1708.6, + "completions/mean_length": 1183.4, + "completions/mean_terminated_length": 1179.7350830078126, + "completions/min_length": 819.6, + "completions/min_terminated_length": 819.6, + "entropy": 0.259357213973999, + "epoch": 2.6145710928319623, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7435207366943359, + "learning_rate": 2.460625151441725e-07, + "loss": -0.0046, + "num_tokens": 299801477.0, + "reward": 0.7505208492279053, + "reward_std": 0.13032824955880642, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7505208492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29717023074626925, + "sampling/importance_sampling_ratio/max": 1.993881344795227, + "sampling/importance_sampling_ratio/mean": 1.000022292137146, + "sampling/importance_sampling_ratio/min": 0.3933206915855408, + "sampling/sampling_logp_difference/max": 1.1224096775054933, + "sampling/sampling_logp_difference/mean": 0.013361809588968755, + "step": 2225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1441.4, + "completions/max_terminated_length": 1441.4, + "completions/mean_length": 1047.221875, + "completions/mean_terminated_length": 1047.221875, + "completions/min_length": 758.6, + "completions/min_terminated_length": 758.6, + "entropy": 0.2406633496284485, + "epoch": 2.6204465334900116, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.41060614585876465, + "learning_rate": 2.45456748243276e-07, + "loss": 0.0021, + "num_tokens": 300458348.0, + "reward": 0.8791666746139526, + "reward_std": 0.06388028524816036, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8791666746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22914170026779174, + "sampling/importance_sampling_ratio/max": 1.9028963804244996, + "sampling/importance_sampling_ratio/mean": 0.9999442934989929, + "sampling/importance_sampling_ratio/min": 0.3476558208465576, + "sampling/sampling_logp_difference/max": 1.069634747505188, + "sampling/sampling_logp_difference/mean": 0.01292272675782442, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1604.2, + "completions/max_terminated_length": 1604.2, + "completions/mean_length": 1084.35, + "completions/mean_terminated_length": 1084.35, + "completions/min_length": 794.8, + "completions/min_terminated_length": 794.8, + "entropy": 0.24303655624389647, + "epoch": 2.6263219741480612, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7626137137413025, + "learning_rate": 2.4485098134237944e-07, + "loss": 0.0031, + "num_tokens": 301145692.0, + "reward": 0.775000023841858, + "reward_std": 0.14809788316488265, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7750000178813934, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32100625038146974, + "sampling/importance_sampling_ratio/max": 1.8840370416641234, + "sampling/importance_sampling_ratio/mean": 1.0000063896179199, + "sampling/importance_sampling_ratio/min": 0.31093878746032716, + "sampling/sampling_logp_difference/max": 1.2778079032897949, + "sampling/sampling_logp_difference/mean": 0.013110420294106006, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1503.6, + "completions/max_terminated_length": 1503.6, + "completions/mean_length": 1122.2, + "completions/mean_terminated_length": 1122.2, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "entropy": 0.2503249257802963, + "epoch": 2.6321974148061105, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7380458116531372, + "learning_rate": 2.442452144414829e-07, + "loss": 0.0018, + "num_tokens": 301798796.0, + "reward": 0.8723958492279053, + "reward_std": 0.08415319249033928, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8723958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2061576321721077, + "sampling/importance_sampling_ratio/max": 1.8270782232284546, + "sampling/importance_sampling_ratio/mean": 0.9999822735786438, + "sampling/importance_sampling_ratio/min": 0.4466776907444, + "sampling/sampling_logp_difference/max": 0.9171277284622192, + "sampling/sampling_logp_difference/mean": 0.013088957034051418, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1467.0, + "completions/max_terminated_length": 1467.0, + "completions/mean_length": 1079.2375, + "completions/mean_terminated_length": 1079.2375, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "entropy": 0.24606316089630126, + "epoch": 2.6380728554641597, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.49479740858078003, + "learning_rate": 2.4363944754058636e-07, + "loss": 0.0042, + "num_tokens": 302497080.0, + "reward": 0.9192708492279053, + "reward_std": 0.09399299174547196, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9192708492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16631564050912856, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001073718070983, + "sampling/importance_sampling_ratio/min": 0.30234395563602445, + "sampling/sampling_logp_difference/max": 1.2849443435668946, + "sampling/sampling_logp_difference/mean": 0.01323620304465294, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1562.0, + "completions/max_terminated_length": 1562.0, + "completions/mean_length": 1161.11875, + "completions/mean_terminated_length": 1161.11875, + "completions/min_length": 900.0, + "completions/min_terminated_length": 900.0, + "entropy": 0.2619780212640762, + "epoch": 2.6439482961222094, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.48926234245300293, + "learning_rate": 2.4303368063968985e-07, + "loss": -0.0007, + "num_tokens": 303196526.0, + "reward": 0.7757812738418579, + "reward_std": 0.046535524725914004, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7757812738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.36423116475343703, + "sampling/importance_sampling_ratio/max": 1.9756156206130981, + "sampling/importance_sampling_ratio/mean": 1.000075590610504, + "sampling/importance_sampling_ratio/min": 0.34909204840660096, + "sampling/sampling_logp_difference/max": 1.066146230697632, + "sampling/sampling_logp_difference/mean": 0.013476391322910786, + "step": 2250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1581.4, + "completions/max_terminated_length": 1581.4, + "completions/mean_length": 1124.425, + "completions/mean_terminated_length": 1124.425, + "completions/min_length": 843.2, + "completions/min_terminated_length": 843.2, + "entropy": 0.24228012859821318, + "epoch": 2.6498237367802586, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 2.424279137387933e-07, + "loss": 0.0052, + "num_tokens": 303868134.0, + "reward": 0.9687500119209289, + "reward_std": 0.04924879372119904, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9687500119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.09995669350028039, + "sampling/importance_sampling_ratio/max": 1.896589422225952, + "sampling/importance_sampling_ratio/mean": 1.0000037789344787, + "sampling/importance_sampling_ratio/min": 0.38552327156066896, + "sampling/sampling_logp_difference/max": 1.0073142290115356, + "sampling/sampling_logp_difference/mean": 0.01276505459100008, + "step": 2255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1599.6, + "completions/max_terminated_length": 1599.6, + "completions/mean_length": 1126.265625, + "completions/mean_terminated_length": 1126.265625, + "completions/min_length": 868.2, + "completions/min_terminated_length": 868.2, + "entropy": 0.2533190757036209, + "epoch": 2.655699177438308, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.824459433555603, + "learning_rate": 2.418221468378968e-07, + "loss": 0.0002, + "num_tokens": 304553291.0, + "reward": 0.8223958611488342, + "reward_std": 0.12836966216564177, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8223958611488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2609886109828949, + "sampling/importance_sampling_ratio/max": 1.8384875535964966, + "sampling/importance_sampling_ratio/mean": 0.9999510169029235, + "sampling/importance_sampling_ratio/min": 0.4050338566303253, + "sampling/sampling_logp_difference/max": 0.9330639719963074, + "sampling/sampling_logp_difference/mean": 0.01324385330080986, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1628.8, + "completions/max_terminated_length": 1628.8, + "completions/mean_length": 1131.925, + "completions/mean_terminated_length": 1131.925, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.25666095316410065, + "epoch": 2.661574618096357, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6386078000068665, + "learning_rate": 2.412163799370002e-07, + "loss": 0.0038, + "num_tokens": 305229475.0, + "reward": 0.8148958563804627, + "reward_std": 0.11356194913387299, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8148958563804627, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25244076550006866, + "sampling/importance_sampling_ratio/max": 1.9641210556030273, + "sampling/importance_sampling_ratio/mean": 0.999966835975647, + "sampling/importance_sampling_ratio/min": 0.3801779314875603, + "sampling/sampling_logp_difference/max": 1.1193055629730224, + "sampling/sampling_logp_difference/mean": 0.01317644640803337, + "step": 2265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1700.0, + "completions/max_terminated_length": 1700.0, + "completions/mean_length": 1193.60625, + "completions/mean_terminated_length": 1193.60625, + "completions/min_length": 856.2, + "completions/min_terminated_length": 856.2, + "entropy": 0.2445121705532074, + "epoch": 2.6674500587544063, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.40095868706703186, + "learning_rate": 2.406106130361037e-07, + "loss": -0.0086, + "num_tokens": 305924837.0, + "reward": 0.8000520944595337, + "reward_std": 0.11730363368988037, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8000520944595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28921135514974594, + "sampling/importance_sampling_ratio/max": 1.9419026374816895, + "sampling/importance_sampling_ratio/mean": 0.9999643087387085, + "sampling/importance_sampling_ratio/min": 0.32038319408893584, + "sampling/sampling_logp_difference/max": 1.219898271560669, + "sampling/sampling_logp_difference/mean": 0.0127852413803339, + "step": 2270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1673.2, + "completions/max_terminated_length": 1673.2, + "completions/mean_length": 1131.45, + "completions/mean_terminated_length": 1131.45, + "completions/min_length": 824.4, + "completions/min_terminated_length": 824.4, + "entropy": 0.24599368572235109, + "epoch": 2.673325499412456, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.48403316736221313, + "learning_rate": 2.4000484613520714e-07, + "loss": 0.0089, + "num_tokens": 306580261.0, + "reward": 0.7893229305744172, + "reward_std": 0.07952911332249642, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7893229305744172, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27056502997875215, + "sampling/importance_sampling_ratio/max": 1.9567157745361328, + "sampling/importance_sampling_ratio/mean": 0.9999315857887268, + "sampling/importance_sampling_ratio/min": 0.3672967258840799, + "sampling/sampling_logp_difference/max": 1.4008118629455566, + "sampling/sampling_logp_difference/mean": 0.012783203460276126, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1547.6, + "completions/max_terminated_length": 1547.6, + "completions/mean_length": 1147.421875, + "completions/mean_terminated_length": 1147.421875, + "completions/min_length": 876.6, + "completions/min_terminated_length": 876.6, + "entropy": 0.2490744948387146, + "epoch": 2.6792009400705052, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.7384084463119507, + "learning_rate": 2.3939907923431063e-07, + "loss": -0.0021, + "num_tokens": 307250684.0, + "reward": 0.8109375, + "reward_std": 0.0812767967581749, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.810937511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27321686446666715, + "sampling/importance_sampling_ratio/max": 1.9557661771774293, + "sampling/importance_sampling_ratio/mean": 0.9999991416931152, + "sampling/importance_sampling_ratio/min": 0.3092913806438446, + "sampling/sampling_logp_difference/max": 1.2244453430175781, + "sampling/sampling_logp_difference/mean": 0.012955594807863235, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1552.8, + "completions/max_terminated_length": 1552.8, + "completions/mean_length": 1139.478125, + "completions/mean_terminated_length": 1139.478125, + "completions/min_length": 891.4, + "completions/min_terminated_length": 891.4, + "entropy": 0.24866257309913636, + "epoch": 2.6850763807285545, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5620933175086975, + "learning_rate": 2.387933123334141e-07, + "loss": 0.01, + "num_tokens": 307932517.0, + "reward": 0.8145833492279053, + "reward_std": 0.10678637623786927, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8145833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2612167507410049, + "sampling/importance_sampling_ratio/max": 1.8827569007873535, + "sampling/importance_sampling_ratio/mean": 1.0000137686729431, + "sampling/importance_sampling_ratio/min": 0.3459669291973114, + "sampling/sampling_logp_difference/max": 1.0946119785308839, + "sampling/sampling_logp_difference/mean": 0.013137634284794331, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1475.8, + "completions/max_terminated_length": 1475.8, + "completions/mean_length": 1094.653125, + "completions/mean_terminated_length": 1094.653125, + "completions/min_length": 844.4, + "completions/min_terminated_length": 844.4, + "entropy": 0.24577432572841645, + "epoch": 2.690951821386604, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7518845796585083, + "learning_rate": 2.3818754543251755e-07, + "loss": -0.0043, + "num_tokens": 308583686.0, + "reward": 0.8395833492279052, + "reward_std": 0.09295275211334228, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8395833492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2516939163208008, + "sampling/importance_sampling_ratio/max": 1.9372399568557739, + "sampling/importance_sampling_ratio/mean": 0.9999632596969604, + "sampling/importance_sampling_ratio/min": 0.3853158295154572, + "sampling/sampling_logp_difference/max": 1.0470689058303833, + "sampling/sampling_logp_difference/mean": 0.012928933463990688, + "step": 2290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1571.6, + "completions/max_terminated_length": 1571.6, + "completions/mean_length": 1107.1375, + "completions/mean_terminated_length": 1107.1375, + "completions/min_length": 784.2, + "completions/min_terminated_length": 784.2, + "entropy": 0.237555655837059, + "epoch": 2.6968272620446534, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7750040292739868, + "learning_rate": 2.3758177853162102e-07, + "loss": -0.0012, + "num_tokens": 309254690.0, + "reward": 0.801927101612091, + "reward_std": 0.1355880841612816, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8019270896911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2576450608670712, + "sampling/importance_sampling_ratio/max": 1.963302206993103, + "sampling/importance_sampling_ratio/mean": 1.0001328825950622, + "sampling/importance_sampling_ratio/min": 0.3588021665811539, + "sampling/sampling_logp_difference/max": 1.0835482478141785, + "sampling/sampling_logp_difference/mean": 0.012560645118355751, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1334.6, + "completions/max_terminated_length": 1334.6, + "completions/mean_length": 1019.06875, + "completions/mean_terminated_length": 1019.06875, + "completions/min_length": 742.4, + "completions/min_terminated_length": 742.4, + "entropy": 0.2461823046207428, + "epoch": 2.7027027027027026, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4063376486301422, + "learning_rate": 2.3697601163072448e-07, + "loss": 0.0081, + "num_tokens": 309941384.0, + "reward": 0.7347396016120911, + "reward_std": 0.10757745876908302, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7347396016120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3722479432821274, + "sampling/importance_sampling_ratio/max": 1.9909217596054076, + "sampling/importance_sampling_ratio/mean": 0.9999774694442749, + "sampling/importance_sampling_ratio/min": 0.40027025938034055, + "sampling/sampling_logp_difference/max": 1.0413033485412597, + "sampling/sampling_logp_difference/mean": 0.013325695879757404, + "step": 2300 + }, + { + "epoch": 2.7027027027027026, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1500.96, + "eval_completions/max_terminated_length": 1500.96, + "eval_completions/mean_length": 1071.851875, + "eval_completions/mean_terminated_length": 1071.851875, + "eval_completions/min_length": 804.0, + "eval_completions/min_terminated_length": 804.0, + "eval_entropy": 0.24838149964809417, + "eval_frac_reward_zero_std": 0.57, + "eval_loss": 0.0029058277141302824, + "eval_num_tokens": 309941384.0, + "eval_reward": 0.7540104258060455, + "eval_reward_std": 0.08435056537389755, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7540104258060455, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3009152537584305, + "eval_runtime": 409.9966, + "eval_samples_per_second": 0.244, + "eval_sampling/importance_sampling_ratio/max": 1.9231956005096436, + "eval_sampling/importance_sampling_ratio/mean": 1.0000133728981018, + "eval_sampling/importance_sampling_ratio/min": 0.3962679693102837, + "eval_sampling/sampling_logp_difference/max": 1.0625820803642272, + "eval_sampling/sampling_logp_difference/mean": 0.013145512826740742, + "eval_steps_per_second": 0.005, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1685.8, + "completions/max_terminated_length": 1685.8, + "completions/mean_length": 1141.159375, + "completions/mean_terminated_length": 1141.159375, + "completions/min_length": 790.2, + "completions/min_terminated_length": 790.2, + "entropy": 0.24258246421813964, + "epoch": 2.708578143360752, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5708547234535217, + "learning_rate": 2.3637024472982794e-07, + "loss": 0.01, + "num_tokens": 310627675.0, + "reward": 0.7439583539962769, + "reward_std": 0.08287105187773705, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7439583539962769, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3338875025510788, + "sampling/importance_sampling_ratio/max": 1.9743703126907348, + "sampling/importance_sampling_ratio/mean": 1.0000213861465455, + "sampling/importance_sampling_ratio/min": 0.3426578164100647, + "sampling/sampling_logp_difference/max": 1.1372323036193848, + "sampling/sampling_logp_difference/mean": 0.012876101024448871, + "step": 2305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1322.8, + "completions/max_terminated_length": 1322.8, + "completions/mean_length": 1040.896875, + "completions/mean_terminated_length": 1040.896875, + "completions/min_length": 783.8, + "completions/min_terminated_length": 783.8, + "entropy": 0.23975794315338134, + "epoch": 2.7144535840188015, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.45213189721107483, + "learning_rate": 2.357644778289314e-07, + "loss": 0.0011, + "num_tokens": 311274810.0, + "reward": 0.7901041746139527, + "reward_std": 0.09313196986913681, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7901041865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30386063158512117, + "sampling/importance_sampling_ratio/max": 1.8083451986312866, + "sampling/importance_sampling_ratio/mean": 0.9999869465827942, + "sampling/importance_sampling_ratio/min": 0.30560941696166993, + "sampling/sampling_logp_difference/max": 1.2987362384796142, + "sampling/sampling_logp_difference/mean": 0.012629561126232147, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1463.6, + "completions/max_terminated_length": 1463.6, + "completions/mean_length": 1084.85625, + "completions/mean_terminated_length": 1084.85625, + "completions/min_length": 815.6, + "completions/min_terminated_length": 815.6, + "entropy": 0.23631844222545623, + "epoch": 2.720329024676851, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7602788209915161, + "learning_rate": 2.3515871092803487e-07, + "loss": 0.0058, + "num_tokens": 311959612.0, + "reward": 0.9205729246139527, + "reward_std": 0.07825153470039367, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9205729246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1699974089860916, + "sampling/importance_sampling_ratio/max": 1.9587501287460327, + "sampling/importance_sampling_ratio/mean": 0.9999679207801819, + "sampling/importance_sampling_ratio/min": 0.47454640865325926, + "sampling/sampling_logp_difference/max": 0.770689058303833, + "sampling/sampling_logp_difference/mean": 0.012636097148060799, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1321.8, + "completions/max_terminated_length": 1321.8, + "completions/mean_length": 1015.790625, + "completions/mean_terminated_length": 1015.790625, + "completions/min_length": 776.4, + "completions/min_terminated_length": 776.4, + "entropy": 0.225640469789505, + "epoch": 2.7262044653349, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 2.3455294402713836e-07, + "loss": 0.0027, + "num_tokens": 312586553.0, + "reward": 0.8807291865348816, + "reward_std": 0.040735363215208056, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8807291865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21484533101320266, + "sampling/importance_sampling_ratio/max": 1.9278332471847535, + "sampling/importance_sampling_ratio/mean": 0.9999934673309326, + "sampling/importance_sampling_ratio/min": 0.41382956355810163, + "sampling/sampling_logp_difference/max": 1.2142931938171386, + "sampling/sampling_logp_difference/mean": 0.012177078425884247, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1457.6, + "completions/max_terminated_length": 1457.6, + "completions/mean_length": 1075.2, + "completions/mean_terminated_length": 1075.2, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "entropy": 0.21975458264350892, + "epoch": 2.7320799059929497, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6667649745941162, + "learning_rate": 2.3394717712624182e-07, + "loss": 0.0042, + "num_tokens": 313238793.0, + "reward": 0.8770833373069763, + "reward_std": 0.06933515965938568, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8770833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20621103495359422, + "sampling/importance_sampling_ratio/max": 1.941676425933838, + "sampling/importance_sampling_ratio/mean": 0.9999809026718139, + "sampling/importance_sampling_ratio/min": 0.3605471342802048, + "sampling/sampling_logp_difference/max": 1.1387160539627075, + "sampling/sampling_logp_difference/mean": 0.011648139916360378, + "step": 2325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1666.6, + "completions/max_terminated_length": 1666.6, + "completions/mean_length": 1107.15625, + "completions/mean_terminated_length": 1107.15625, + "completions/min_length": 758.4, + "completions/min_terminated_length": 758.4, + "entropy": 0.22798166275024415, + "epoch": 2.737955346650999, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6299868822097778, + "learning_rate": 2.3334141022534528e-07, + "loss": 0.0092, + "num_tokens": 313900027.0, + "reward": 0.8026041865348816, + "reward_std": 0.1279518723487854, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8026041865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.282573002576828, + "sampling/importance_sampling_ratio/max": 1.9123780488967896, + "sampling/importance_sampling_ratio/mean": 0.9999549150466919, + "sampling/importance_sampling_ratio/min": 0.32881303429603576, + "sampling/sampling_logp_difference/max": 1.1452240943908691, + "sampling/sampling_logp_difference/mean": 0.012409896217286586, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 1078.26875, + "completions/mean_terminated_length": 1078.26875, + "completions/min_length": 853.6, + "completions/min_terminated_length": 853.6, + "entropy": 0.2364683359861374, + "epoch": 2.743830787309048, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4049372673034668, + "learning_rate": 2.3273564332444875e-07, + "loss": -0.0026, + "num_tokens": 314558865.0, + "reward": 0.9072916746139527, + "reward_std": 0.06933557838201523, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9072916746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20599967539310454, + "sampling/importance_sampling_ratio/max": 1.9302043199539185, + "sampling/importance_sampling_ratio/mean": 0.9999824285507202, + "sampling/importance_sampling_ratio/min": 0.26731246411800386, + "sampling/sampling_logp_difference/max": 1.3574584007263184, + "sampling/sampling_logp_difference/mean": 0.012622298114001751, + "step": 2335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1699.6, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 1115.8625, + "completions/mean_terminated_length": 1102.2861083984376, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "entropy": 0.23917319178581237, + "epoch": 2.7497062279670974, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.659565269947052, + "learning_rate": 2.3212987642355218e-07, + "loss": -0.0083, + "num_tokens": 315235353.0, + "reward": 0.7572916686534882, + "reward_std": 0.1146910235285759, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7572916686534882, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27366127967834475, + "sampling/importance_sampling_ratio/max": 1.9613188266754151, + "sampling/importance_sampling_ratio/mean": 1.000109815597534, + "sampling/importance_sampling_ratio/min": 0.40567088723182676, + "sampling/sampling_logp_difference/max": 0.9738764047622681, + "sampling/sampling_logp_difference/mean": 0.012759437412023544, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1602.6, + "completions/max_terminated_length": 1490.0, + "completions/mean_length": 1066.584375, + "completions/mean_terminated_length": 1062.37412109375, + "completions/min_length": 722.6, + "completions/min_terminated_length": 722.6, + "entropy": 0.23375988602638245, + "epoch": 2.7555816686251466, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 2.3152410952265567e-07, + "loss": 0.007, + "num_tokens": 315908512.0, + "reward": 0.8244791746139526, + "reward_std": 0.10080392360687256, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8244791746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2682786226272583, + "sampling/importance_sampling_ratio/max": 1.8856647253036498, + "sampling/importance_sampling_ratio/mean": 1.0000274181365967, + "sampling/importance_sampling_ratio/min": 0.3129617631435394, + "sampling/sampling_logp_difference/max": 1.3798688173294067, + "sampling/sampling_logp_difference/mean": 0.012719161063432693, + "step": 2345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1614.8, + "completions/max_terminated_length": 1614.8, + "completions/mean_length": 1075.425, + "completions/mean_terminated_length": 1075.425, + "completions/min_length": 761.6, + "completions/min_terminated_length": 761.6, + "entropy": 0.24059076011180877, + "epoch": 2.7614571092831963, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6796007752418518, + "learning_rate": 2.3091834262175914e-07, + "loss": 0.0106, + "num_tokens": 316558984.0, + "reward": 0.7855208516120911, + "reward_std": 0.08973032981157303, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7855208516120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28483576476573946, + "sampling/importance_sampling_ratio/max": 1.9070019960403441, + "sampling/importance_sampling_ratio/mean": 1.0000741243362428, + "sampling/importance_sampling_ratio/min": 0.23457692796364427, + "sampling/sampling_logp_difference/max": 2.6379079103469847, + "sampling/sampling_logp_difference/mean": 0.013021462410688401, + "step": 2350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1548.4, + "completions/max_terminated_length": 1519.2, + "completions/mean_length": 1059.915625, + "completions/mean_terminated_length": 1039.7633544921875, + "completions/min_length": 782.6, + "completions/min_terminated_length": 782.6, + "entropy": 0.22516947686672212, + "epoch": 2.7673325499412456, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4491136074066162, + "learning_rate": 2.303125757208626e-07, + "loss": -0.0128, + "num_tokens": 317209929.0, + "reward": 0.8020833373069763, + "reward_std": 0.06487823724746704, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3081536591053009, + "sampling/importance_sampling_ratio/max": 1.9772464275360107, + "sampling/importance_sampling_ratio/mean": 1.000032651424408, + "sampling/importance_sampling_ratio/min": 0.38993417248129847, + "sampling/sampling_logp_difference/max": 1.3113280177116393, + "sampling/sampling_logp_difference/mean": 0.012163999117910862, + "step": 2355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1625.2, + "completions/max_terminated_length": 1625.2, + "completions/mean_length": 1096.434375, + "completions/mean_terminated_length": 1096.434375, + "completions/min_length": 822.6, + "completions/min_terminated_length": 822.6, + "entropy": 0.23812492191791534, + "epoch": 2.773207990599295, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4981617033481598, + "learning_rate": 2.2970680881996606e-07, + "loss": 0.0126, + "num_tokens": 317890724.0, + "reward": 0.8398437619209289, + "reward_std": 0.10197720378637314, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8398437619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2605766087770462, + "sampling/importance_sampling_ratio/max": 1.9663613796234132, + "sampling/importance_sampling_ratio/mean": 0.9999598026275635, + "sampling/importance_sampling_ratio/min": 0.3322810932993889, + "sampling/sampling_logp_difference/max": 1.3144399881362916, + "sampling/sampling_logp_difference/mean": 0.012650839053094387, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1339.6, + "completions/max_terminated_length": 1339.6, + "completions/mean_length": 995.4, + "completions/mean_terminated_length": 995.4, + "completions/min_length": 715.8, + "completions/min_terminated_length": 715.8, + "entropy": 0.21778804659843445, + "epoch": 2.7790834312573445, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.8730181455612183, + "learning_rate": 2.2910104191906955e-07, + "loss": -0.0016, + "num_tokens": 318524436.0, + "reward": 0.9052083492279053, + "reward_std": 0.06153279021382332, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9052083492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18928753733634948, + "sampling/importance_sampling_ratio/max": 1.8762676239013671, + "sampling/importance_sampling_ratio/mean": 0.9999732494354248, + "sampling/importance_sampling_ratio/min": 0.3732657790184021, + "sampling/sampling_logp_difference/max": 1.1048237800598144, + "sampling/sampling_logp_difference/mean": 0.011777786910533905, + "step": 2365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1410.8, + "completions/max_terminated_length": 1410.8, + "completions/mean_length": 991.55, + "completions/mean_terminated_length": 991.55, + "completions/min_length": 743.2, + "completions/min_terminated_length": 743.2, + "entropy": 0.2386650711297989, + "epoch": 2.7849588719153937, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5324167013168335, + "learning_rate": 2.28495275018173e-07, + "loss": -0.0004, + "num_tokens": 319173860.0, + "reward": 0.8041666746139526, + "reward_std": 0.08360731303691864, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8041666746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23412725180387498, + "sampling/importance_sampling_ratio/max": 1.9656662702560426, + "sampling/importance_sampling_ratio/mean": 0.9999577879905701, + "sampling/importance_sampling_ratio/min": 0.4060625612735748, + "sampling/sampling_logp_difference/max": 1.150398063659668, + "sampling/sampling_logp_difference/mean": 0.012849260680377483, + "step": 2370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1420.6, + "completions/max_terminated_length": 1420.6, + "completions/mean_length": 1008.525, + "completions/mean_terminated_length": 1008.525, + "completions/min_length": 721.2, + "completions/min_terminated_length": 721.2, + "entropy": 0.21963294446468354, + "epoch": 2.790834312573443, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6863450407981873, + "learning_rate": 2.2788950811727648e-07, + "loss": 0.0011, + "num_tokens": 319832220.0, + "reward": 0.7083333492279053, + "reward_std": 0.0958444319665432, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7083333492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34578675627708433, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9998778700828552, + "sampling/importance_sampling_ratio/min": 0.2893254727125168, + "sampling/sampling_logp_difference/max": 1.3027319669723512, + "sampling/sampling_logp_difference/mean": 0.012292330339550971, + "step": 2375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1497.6, + "completions/max_terminated_length": 1497.6, + "completions/mean_length": 1102.7625, + "completions/mean_terminated_length": 1102.7625, + "completions/min_length": 832.6, + "completions/min_terminated_length": 832.6, + "entropy": 0.2415948212146759, + "epoch": 2.796709753231492, + "frac_reward_zero_std": 0.45, + "grad_norm": 1.0058459043502808, + "learning_rate": 2.272837412163799e-07, + "loss": 0.0055, + "num_tokens": 320522224.0, + "reward": 0.8317708373069763, + "reward_std": 0.1039445236325264, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8317708373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2156577706336975, + "sampling/importance_sampling_ratio/max": 1.988040065765381, + "sampling/importance_sampling_ratio/mean": 0.9999471068382263, + "sampling/importance_sampling_ratio/min": 0.3297655165195465, + "sampling/sampling_logp_difference/max": 1.2059980869293212, + "sampling/sampling_logp_difference/mean": 0.012710276432335377, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1576.6, + "completions/max_terminated_length": 1576.6, + "completions/mean_length": 1079.4, + "completions/mean_terminated_length": 1079.4, + "completions/min_length": 729.8, + "completions/min_terminated_length": 729.8, + "entropy": 0.24798661470413208, + "epoch": 2.802585193889542, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6440450549125671, + "learning_rate": 2.2667797431548338e-07, + "loss": 0.0003, + "num_tokens": 321186496.0, + "reward": 0.7979166746139527, + "reward_std": 0.073116684705019, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7979166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2511927545070648, + "sampling/importance_sampling_ratio/max": 1.9184043169021607, + "sampling/importance_sampling_ratio/mean": 0.9999388098716736, + "sampling/importance_sampling_ratio/min": 0.3903336763381958, + "sampling/sampling_logp_difference/max": 0.9640900135040283, + "sampling/sampling_logp_difference/mean": 0.012855613417923451, + "step": 2385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1510.8, + "completions/max_terminated_length": 1510.8, + "completions/mean_length": 1024.43125, + "completions/mean_terminated_length": 1024.43125, + "completions/min_length": 795.2, + "completions/min_terminated_length": 795.2, + "entropy": 0.24828683137893676, + "epoch": 2.808460634547591, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4777759909629822, + "learning_rate": 2.2607220741458686e-07, + "loss": 0.0043, + "num_tokens": 321826602.0, + "reward": 0.8531250357627869, + "reward_std": 0.09283200576901436, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8531250357627869, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2510357365012169, + "sampling/importance_sampling_ratio/max": 1.9304353952407838, + "sampling/importance_sampling_ratio/mean": 1.0000390410423279, + "sampling/importance_sampling_ratio/min": 0.4463606238365173, + "sampling/sampling_logp_difference/max": 0.9297639846801757, + "sampling/sampling_logp_difference/mean": 0.013255661353468895, + "step": 2390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.8, + "completions/max_terminated_length": 1558.8, + "completions/mean_length": 1120.89375, + "completions/mean_terminated_length": 1120.89375, + "completions/min_length": 806.4, + "completions/min_terminated_length": 806.4, + "entropy": 0.24372271895408631, + "epoch": 2.8143360752056403, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7181718945503235, + "learning_rate": 2.2546644051369033e-07, + "loss": 0.0033, + "num_tokens": 322511752.0, + "reward": 0.8140625119209289, + "reward_std": 0.10003995001316071, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8140625119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20417188704013825, + "sampling/importance_sampling_ratio/max": 1.9468562364578248, + "sampling/importance_sampling_ratio/mean": 0.9999962449073792, + "sampling/importance_sampling_ratio/min": 0.3644874632358551, + "sampling/sampling_logp_difference/max": 1.0251569509506226, + "sampling/sampling_logp_difference/mean": 0.012897053360939026, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1386.2, + "completions/max_terminated_length": 1386.2, + "completions/mean_length": 1072.1, + "completions/mean_terminated_length": 1072.1, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.23362427949905396, + "epoch": 2.82021151586369, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.3538127839565277, + "learning_rate": 2.248606736127938e-07, + "loss": 0.0005, + "num_tokens": 323181272.0, + "reward": 0.928697919845581, + "reward_std": 0.033222814276814464, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.928697919845581, + "rewards/e2e_recall_precision_mixed_reward/std": 0.12631498724222184, + "sampling/importance_sampling_ratio/max": 1.8217214107513429, + "sampling/importance_sampling_ratio/mean": 0.9999819278717041, + "sampling/importance_sampling_ratio/min": 0.35696284770965575, + "sampling/sampling_logp_difference/max": 1.0873285770416259, + "sampling/sampling_logp_difference/mean": 0.012447315640747547, + "step": 2400 + }, + { + "epoch": 2.82021151586369, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1521.0, + "eval_completions/max_terminated_length": 1521.0, + "eval_completions/mean_length": 1084.185, + "eval_completions/mean_terminated_length": 1084.185, + "eval_completions/min_length": 813.92, + "eval_completions/min_terminated_length": 813.92, + "eval_entropy": 0.25041636466979983, + "eval_frac_reward_zero_std": 0.62, + "eval_loss": 0.0041066440753638744, + "eval_num_tokens": 323181272.0, + "eval_reward": 0.7530312585830689, + "eval_reward_std": 0.0783327068388462, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7530312597751617, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2980782437324524, + "eval_runtime": 415.7277, + "eval_samples_per_second": 0.241, + "eval_sampling/importance_sampling_ratio/max": 1.9338807630538941, + "eval_sampling/importance_sampling_ratio/mean": 1.0000006413459779, + "eval_sampling/importance_sampling_ratio/min": 0.32921223118901255, + "eval_sampling/sampling_logp_difference/max": 1.2984151482582091, + "eval_sampling/sampling_logp_difference/mean": 0.013157993406057358, + "eval_steps_per_second": 0.005, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1988.6, + "completions/max_terminated_length": 1988.6, + "completions/mean_length": 1143.284375, + "completions/mean_terminated_length": 1143.284375, + "completions/min_length": 789.2, + "completions/min_terminated_length": 789.2, + "entropy": 0.25749709010124205, + "epoch": 2.8260869565217392, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5876041054725647, + "learning_rate": 2.2425490671189725e-07, + "loss": 0.0072, + "num_tokens": 323895763.0, + "reward": 0.795677101612091, + "reward_std": 0.10413843393325806, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.795677101612091, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3029506832361221, + "sampling/importance_sampling_ratio/max": 1.9795888662338257, + "sampling/importance_sampling_ratio/mean": 1.0000039696693421, + "sampling/importance_sampling_ratio/min": 0.3718868136405945, + "sampling/sampling_logp_difference/max": 1.1153724908828735, + "sampling/sampling_logp_difference/mean": 0.013322078436613084, + "step": 2405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.4, + "completions/max_terminated_length": 1622.4, + "completions/mean_length": 1119.771875, + "completions/mean_terminated_length": 1119.771875, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "entropy": 0.2605073541402817, + "epoch": 2.8319623971797885, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.5048226118087769, + "learning_rate": 2.2364913981100072e-07, + "loss": -0.0009, + "num_tokens": 324567498.0, + "reward": 0.7794270873069763, + "reward_std": 0.09945255517959595, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7794270992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29298948049545287, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999875545501709, + "sampling/importance_sampling_ratio/min": 0.3045585220679641, + "sampling/sampling_logp_difference/max": 1.7741375207901, + "sampling/sampling_logp_difference/mean": 0.013549309782683849, + "step": 2410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1547.8, + "completions/max_terminated_length": 1547.8, + "completions/mean_length": 1117.696875, + "completions/mean_terminated_length": 1117.696875, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "entropy": 0.2522917926311493, + "epoch": 2.8378378378378377, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5383021831512451, + "learning_rate": 2.230433729101042e-07, + "loss": -0.003, + "num_tokens": 325233065.0, + "reward": 0.7588541746139527, + "reward_std": 0.07800202667713166, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7588541746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3657463490962982, + "sampling/importance_sampling_ratio/max": 1.9869714260101319, + "sampling/importance_sampling_ratio/mean": 0.9999585270881652, + "sampling/importance_sampling_ratio/min": 0.33153983354568484, + "sampling/sampling_logp_difference/max": 1.1554431915283203, + "sampling/sampling_logp_difference/mean": 0.013245697319507598, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1408.0, + "completions/max_terminated_length": 1408.0, + "completions/mean_length": 1086.828125, + "completions/mean_terminated_length": 1086.828125, + "completions/min_length": 823.2, + "completions/min_terminated_length": 823.2, + "entropy": 0.24877199530601501, + "epoch": 2.843713278495887, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7546255588531494, + "learning_rate": 2.2243760600920764e-07, + "loss": 0.0012, + "num_tokens": 325882322.0, + "reward": 0.7520833492279053, + "reward_std": 0.08643076345324516, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7520833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2982165992259979, + "sampling/importance_sampling_ratio/max": 1.9108956575393676, + "sampling/importance_sampling_ratio/mean": 0.9998527646064759, + "sampling/importance_sampling_ratio/min": 0.34085713028907777, + "sampling/sampling_logp_difference/max": 1.1262581586837768, + "sampling/sampling_logp_difference/mean": 0.01293862983584404, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1657.4, + "completions/max_terminated_length": 1657.4, + "completions/mean_length": 1171.2, + "completions/mean_terminated_length": 1171.2, + "completions/min_length": 850.6, + "completions/min_terminated_length": 850.6, + "entropy": 0.24384477734565735, + "epoch": 2.8495887191539366, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 2.218318391083111e-07, + "loss": -0.002, + "num_tokens": 326571378.0, + "reward": 0.717187511920929, + "reward_std": 0.06420539878308773, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.717187511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.334746053814888, + "sampling/importance_sampling_ratio/max": 1.934575629234314, + "sampling/importance_sampling_ratio/mean": 1.0000715255737305, + "sampling/importance_sampling_ratio/min": 0.3990495681762695, + "sampling/sampling_logp_difference/max": 1.0094342708587647, + "sampling/sampling_logp_difference/mean": 0.012629887461662293, + "step": 2425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1639.8, + "completions/max_terminated_length": 1639.8, + "completions/mean_length": 1121.29375, + "completions/mean_terminated_length": 1121.29375, + "completions/min_length": 830.0, + "completions/min_terminated_length": 830.0, + "entropy": 0.24283508062362671, + "epoch": 2.855464159811986, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.46253702044487, + "learning_rate": 2.2122607220741457e-07, + "loss": 0.0014, + "num_tokens": 327278640.0, + "reward": 0.7898437857627869, + "reward_std": 0.0780556008219719, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7898437857627869, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2904230237007141, + "sampling/importance_sampling_ratio/max": 1.982173752784729, + "sampling/importance_sampling_ratio/mean": 1.00010347366333, + "sampling/importance_sampling_ratio/min": 0.3133694648742676, + "sampling/sampling_logp_difference/max": 1.2216663122177125, + "sampling/sampling_logp_difference/mean": 0.01267168838530779, + "step": 2430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1457.4, + "completions/max_terminated_length": 1457.4, + "completions/mean_length": 1109.309375, + "completions/mean_terminated_length": 1109.309375, + "completions/min_length": 809.2, + "completions/min_terminated_length": 809.2, + "entropy": 0.2510025084018707, + "epoch": 2.861339600470035, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6064744591712952, + "learning_rate": 2.2062030530651803e-07, + "loss": 0.0057, + "num_tokens": 327965651.0, + "reward": 0.85703125, + "reward_std": 0.10422002114355564, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8570312619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24693194925785064, + "sampling/importance_sampling_ratio/max": 1.9674418687820434, + "sampling/importance_sampling_ratio/mean": 1.0000630259513854, + "sampling/importance_sampling_ratio/min": 0.3229108899831772, + "sampling/sampling_logp_difference/max": 1.1470834374427796, + "sampling/sampling_logp_difference/mean": 0.012952681444585324, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.8, + "completions/max_terminated_length": 1498.8, + "completions/mean_length": 1084.78125, + "completions/mean_terminated_length": 1084.78125, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "entropy": 0.25026600658893583, + "epoch": 2.867215041128085, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4441192150115967, + "learning_rate": 2.2001453840562152e-07, + "loss": 0.0012, + "num_tokens": 328652029.0, + "reward": 0.8427083373069764, + "reward_std": 0.08037955164909363, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8427083373069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2896536558866501, + "sampling/importance_sampling_ratio/max": 1.9711972713470458, + "sampling/importance_sampling_ratio/mean": 0.9999094724655151, + "sampling/importance_sampling_ratio/min": 0.3898857295513153, + "sampling/sampling_logp_difference/max": 1.0192480564117432, + "sampling/sampling_logp_difference/mean": 0.013036524504423141, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1677.2, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 1131.771875, + "completions/mean_terminated_length": 1123.9448486328124, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "entropy": 0.2522565394639969, + "epoch": 2.873090481786134, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.49075251817703247, + "learning_rate": 2.1940877150472498e-07, + "loss": -0.0113, + "num_tokens": 329330124.0, + "reward": 0.8895833492279053, + "reward_std": 0.07341724410653114, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8895833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2047812134027481, + "sampling/importance_sampling_ratio/max": 1.9501996517181397, + "sampling/importance_sampling_ratio/mean": 0.9999414086341858, + "sampling/importance_sampling_ratio/min": 0.28733372688293457, + "sampling/sampling_logp_difference/max": 1.2867217302322387, + "sampling/sampling_logp_difference/mean": 0.013158978708088399, + "step": 2445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.2, + "completions/max_terminated_length": 1515.2, + "completions/mean_length": 1143.65625, + "completions/mean_terminated_length": 1143.65625, + "completions/min_length": 856.6, + "completions/min_terminated_length": 856.6, + "entropy": 0.26105785369873047, + "epoch": 2.8789659224441833, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6251075863838196, + "learning_rate": 2.1880300460382845e-07, + "loss": -0.001, + "num_tokens": 330017470.0, + "reward": 0.8720833420753479, + "reward_std": 0.11039019525051116, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8720833420753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21222806125879287, + "sampling/importance_sampling_ratio/max": 1.8873302221298218, + "sampling/importance_sampling_ratio/mean": 1.0000155210494994, + "sampling/importance_sampling_ratio/min": 0.3039222886785865, + "sampling/sampling_logp_difference/max": 1.8534332752227782, + "sampling/sampling_logp_difference/mean": 0.013153030537068844, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1747.0, + "completions/max_terminated_length": 1738.6, + "completions/mean_length": 1155.228125, + "completions/mean_terminated_length": 1139.845849609375, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.2519680917263031, + "epoch": 2.8848413631022325, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6380645036697388, + "learning_rate": 2.181972377029319e-07, + "loss": -0.0153, + "num_tokens": 330704599.0, + "reward": 0.8036458492279053, + "reward_std": 0.07958495393395423, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8036458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24552056342363357, + "sampling/importance_sampling_ratio/max": 1.8710920572280885, + "sampling/importance_sampling_ratio/mean": 1.0000233888626098, + "sampling/importance_sampling_ratio/min": 0.3811956226825714, + "sampling/sampling_logp_difference/max": 1.0013225317001342, + "sampling/sampling_logp_difference/mean": 0.013104490749537945, + "step": 2455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1977.6, + "completions/max_terminated_length": 1977.6, + "completions/mean_length": 1219.80625, + "completions/mean_terminated_length": 1219.80625, + "completions/min_length": 826.2, + "completions/min_terminated_length": 826.2, + "entropy": 0.2533602148294449, + "epoch": 2.890716803760282, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6828557252883911, + "learning_rate": 2.1759147080203534e-07, + "loss": 0.0033, + "num_tokens": 331419417.0, + "reward": 0.8088541865348816, + "reward_std": 0.07977185398340225, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8088541865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3063382938504219, + "sampling/importance_sampling_ratio/max": 1.9880699634552002, + "sampling/importance_sampling_ratio/mean": 1.0000319957733155, + "sampling/importance_sampling_ratio/min": 0.3192105397582054, + "sampling/sampling_logp_difference/max": 1.4089489936828614, + "sampling/sampling_logp_difference/mean": 0.01299858596175909, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1555.4, + "completions/max_terminated_length": 1555.4, + "completions/mean_length": 1136.778125, + "completions/mean_terminated_length": 1136.778125, + "completions/min_length": 876.0, + "completions/min_terminated_length": 876.0, + "entropy": 0.2615731358528137, + "epoch": 2.8965922444183314, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5761390328407288, + "learning_rate": 2.1698570390113883e-07, + "loss": -0.0011, + "num_tokens": 332091554.0, + "reward": 0.8898437738418579, + "reward_std": 0.07263861447572709, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8898437738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19870344996452333, + "sampling/importance_sampling_ratio/max": 1.9780163526535035, + "sampling/importance_sampling_ratio/mean": 0.9998313546180725, + "sampling/importance_sampling_ratio/min": 0.4730712652206421, + "sampling/sampling_logp_difference/max": 0.795024037361145, + "sampling/sampling_logp_difference/mean": 0.013214756362140178, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1676.6, + "completions/max_terminated_length": 1676.6, + "completions/mean_length": 1146.296875, + "completions/mean_terminated_length": 1146.296875, + "completions/min_length": 832.2, + "completions/min_terminated_length": 832.2, + "entropy": 0.2526620090007782, + "epoch": 2.9024676850763806, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6532995104789734, + "learning_rate": 2.163799370002423e-07, + "loss": 0.0031, + "num_tokens": 332762497.0, + "reward": 0.9223958492279053, + "reward_std": 0.06299638226628304, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9223958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17120740562677383, + "sampling/importance_sampling_ratio/max": 1.9535242080688477, + "sampling/importance_sampling_ratio/mean": 1.0000123977661133, + "sampling/importance_sampling_ratio/min": 0.3405495584011078, + "sampling/sampling_logp_difference/max": 1.081800150871277, + "sampling/sampling_logp_difference/mean": 0.012900187820196151, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1644.6, + "completions/max_terminated_length": 1644.6, + "completions/mean_length": 1169.665625, + "completions/mean_terminated_length": 1169.665625, + "completions/min_length": 833.6, + "completions/min_terminated_length": 833.6, + "entropy": 0.26975159645080565, + "epoch": 2.9083431257344303, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.599644660949707, + "learning_rate": 2.1577417009934576e-07, + "loss": -0.0003, + "num_tokens": 333444758.0, + "reward": 0.8614583492279053, + "reward_std": 0.059494443237781525, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8614583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19280930310487748, + "sampling/importance_sampling_ratio/max": 1.9035891771316529, + "sampling/importance_sampling_ratio/mean": 1.0000111937522889, + "sampling/importance_sampling_ratio/min": 0.3698125422000885, + "sampling/sampling_logp_difference/max": 1.0941879034042359, + "sampling/sampling_logp_difference/mean": 0.0135704992339015, + "step": 2475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1623.8, + "completions/max_terminated_length": 1623.8, + "completions/mean_length": 1155.8375, + "completions/mean_terminated_length": 1155.8375, + "completions/min_length": 916.2, + "completions/min_terminated_length": 916.2, + "entropy": 0.26333553791046144, + "epoch": 2.9142185663924796, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5579608082771301, + "learning_rate": 2.1516840319844922e-07, + "loss": 0.0036, + "num_tokens": 334145586.0, + "reward": 0.8338541865348816, + "reward_std": 0.10666598826646805, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8338541865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27408269345760344, + "sampling/importance_sampling_ratio/max": 1.96576726436615, + "sampling/importance_sampling_ratio/mean": 1.0000550985336303, + "sampling/importance_sampling_ratio/min": 0.393053674697876, + "sampling/sampling_logp_difference/max": 1.2964228630065917, + "sampling/sampling_logp_difference/mean": 0.013295540772378444, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1590.0, + "completions/max_terminated_length": 1590.0, + "completions/mean_length": 1181.290625, + "completions/mean_terminated_length": 1181.290625, + "completions/min_length": 885.2, + "completions/min_terminated_length": 885.2, + "entropy": 0.2766443967819214, + "epoch": 2.920094007050529, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.44202736020088196, + "learning_rate": 2.1456263629755269e-07, + "loss": 0.0009, + "num_tokens": 334873375.0, + "reward": 0.7684895992279053, + "reward_std": 0.05524676963686943, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7684895992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25875370651483537, + "sampling/importance_sampling_ratio/max": 1.9369422912597656, + "sampling/importance_sampling_ratio/mean": 1.000017511844635, + "sampling/importance_sampling_ratio/min": 0.2905049294233322, + "sampling/sampling_logp_difference/max": 1.423995351791382, + "sampling/sampling_logp_difference/mean": 0.014015245065093041, + "step": 2485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1652.6, + "completions/mean_length": 1200.453125, + "completions/mean_terminated_length": 1190.051318359375, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.2852647066116333, + "epoch": 2.925969447708578, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6844410300254822, + "learning_rate": 2.1395686939665617e-07, + "loss": 0.0034, + "num_tokens": 335560532.0, + "reward": 0.6661458373069763, + "reward_std": 0.09234302788972855, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6661458373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.310054212808609, + "sampling/importance_sampling_ratio/max": 1.9192859172821044, + "sampling/importance_sampling_ratio/mean": 1.0000102877616883, + "sampling/importance_sampling_ratio/min": 0.35606696009635924, + "sampling/sampling_logp_difference/max": 1.0444335222244263, + "sampling/sampling_logp_difference/mean": 0.01423647254705429, + "step": 2490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1445.4, + "completions/max_terminated_length": 1445.4, + "completions/mean_length": 1093.390625, + "completions/mean_terminated_length": 1093.390625, + "completions/min_length": 831.8, + "completions/min_terminated_length": 831.8, + "entropy": 0.2477249264717102, + "epoch": 2.9318448883666273, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6126586198806763, + "learning_rate": 2.1335110249575964e-07, + "loss": 0.0008, + "num_tokens": 336230017.0, + "reward": 0.881250011920929, + "reward_std": 0.0689006544649601, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.881250011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21311764121055604, + "sampling/importance_sampling_ratio/max": 1.9315643072128297, + "sampling/importance_sampling_ratio/mean": 0.9998762845993042, + "sampling/importance_sampling_ratio/min": 0.4095799148082733, + "sampling/sampling_logp_difference/max": 0.9346026420593262, + "sampling/sampling_logp_difference/mean": 0.01287180297076702, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1656.2, + "completions/max_terminated_length": 1656.2, + "completions/mean_length": 1153.73125, + "completions/mean_terminated_length": 1153.73125, + "completions/min_length": 744.6, + "completions/min_terminated_length": 744.6, + "entropy": 0.25892970263957976, + "epoch": 2.937720329024677, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.699809193611145, + "learning_rate": 2.1274533559486307e-07, + "loss": -0.0013, + "num_tokens": 336941419.0, + "reward": 0.8398437738418579, + "reward_std": 0.13118309378623963, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8398437738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24315484762191772, + "sampling/importance_sampling_ratio/max": 1.9509620666503906, + "sampling/importance_sampling_ratio/mean": 0.9998958230018615, + "sampling/importance_sampling_ratio/min": 0.3090625017881393, + "sampling/sampling_logp_difference/max": 1.36893892288208, + "sampling/sampling_logp_difference/mean": 0.013196432776749135, + "step": 2500 + }, + { + "epoch": 2.937720329024677, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1615.92, + "eval_completions/max_terminated_length": 1615.92, + "eval_completions/mean_length": 1160.8625, + "eval_completions/mean_terminated_length": 1160.8625, + "eval_completions/min_length": 864.4, + "eval_completions/min_terminated_length": 864.4, + "eval_entropy": 0.2651230132579803, + "eval_frac_reward_zero_std": 0.57, + "eval_loss": 0.003797011449933052, + "eval_num_tokens": 336941419.0, + "eval_reward": 0.7690000116825104, + "eval_reward_std": 0.08354492157697678, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7690000140666962, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2940467694401741, + "eval_runtime": 438.4961, + "eval_samples_per_second": 0.228, + "eval_sampling/importance_sampling_ratio/max": 1.9412076902389526, + "eval_sampling/importance_sampling_ratio/mean": 1.0000221300125123, + "eval_sampling/importance_sampling_ratio/min": 0.35337373718619347, + "eval_sampling/sampling_logp_difference/max": 1.1907857728004456, + "eval_sampling/sampling_logp_difference/mean": 0.013423861749470235, + "eval_steps_per_second": 0.005, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1678.8, + "completions/max_terminated_length": 1678.8, + "completions/mean_length": 1160.890625, + "completions/mean_terminated_length": 1160.890625, + "completions/min_length": 871.8, + "completions/min_terminated_length": 871.8, + "entropy": 0.2552634745836258, + "epoch": 2.943595769682726, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.43133029341697693, + "learning_rate": 2.1213956869396654e-07, + "loss": 0.0021, + "num_tokens": 337630280.0, + "reward": 0.7451562523841858, + "reward_std": 0.058319534920156, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7451562523841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34460012018680575, + "sampling/importance_sampling_ratio/max": 1.9622292518615723, + "sampling/importance_sampling_ratio/mean": 0.9999729037284851, + "sampling/importance_sampling_ratio/min": 0.36249165832996366, + "sampling/sampling_logp_difference/max": 1.1442147254943849, + "sampling/sampling_logp_difference/mean": 0.01324941124767065, + "step": 2505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1671.0, + "completions/max_terminated_length": 1671.0, + "completions/mean_length": 1157.425, + "completions/mean_terminated_length": 1157.425, + "completions/min_length": 815.6, + "completions/min_terminated_length": 815.6, + "entropy": 0.26674684882164, + "epoch": 2.9494712103407754, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6723577380180359, + "learning_rate": 2.1153380179307e-07, + "loss": -0.0054, + "num_tokens": 338330224.0, + "reward": 0.8408854484558106, + "reward_std": 0.06934458911418914, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8408854484558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23293874263763428, + "sampling/importance_sampling_ratio/max": 1.9028401374816895, + "sampling/importance_sampling_ratio/mean": 1.0001383185386659, + "sampling/importance_sampling_ratio/min": 0.38664844036102297, + "sampling/sampling_logp_difference/max": 0.9693643093109131, + "sampling/sampling_logp_difference/mean": 0.01352162528783083, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1909.0, + "completions/max_terminated_length": 1860.6, + "completions/mean_length": 1282.859375, + "completions/mean_terminated_length": 1279.4164794921876, + "completions/min_length": 871.6, + "completions/min_terminated_length": 871.6, + "entropy": 0.2674229830503464, + "epoch": 2.955346650998825, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8589381575584412, + "learning_rate": 2.109280348921735e-07, + "loss": -0.0084, + "num_tokens": 339039711.0, + "reward": 0.7507812738418579, + "reward_std": 0.12396226227283477, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7507812738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33741641640663145, + "sampling/importance_sampling_ratio/max": 1.9285744667053222, + "sampling/importance_sampling_ratio/mean": 1.0001076936721802, + "sampling/importance_sampling_ratio/min": 0.3456153243780136, + "sampling/sampling_logp_difference/max": 1.2050754070281982, + "sampling/sampling_logp_difference/mean": 0.013429709896445274, + "step": 2515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1685.0, + "completions/max_terminated_length": 1685.0, + "completions/mean_length": 1150.846875, + "completions/mean_terminated_length": 1150.846875, + "completions/min_length": 829.2, + "completions/min_terminated_length": 829.2, + "entropy": 0.2486409604549408, + "epoch": 2.9612220916568743, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5134599804878235, + "learning_rate": 2.1032226799127695e-07, + "loss": 0.0046, + "num_tokens": 339715694.0, + "reward": 0.840625011920929, + "reward_std": 0.045786444842815396, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.840625011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2133595198392868, + "sampling/importance_sampling_ratio/max": 1.9325678586959838, + "sampling/importance_sampling_ratio/mean": 1.000019907951355, + "sampling/importance_sampling_ratio/min": 0.41126868724822996, + "sampling/sampling_logp_difference/max": 0.9507953882217407, + "sampling/sampling_logp_difference/mean": 0.012637078016996383, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1533.4, + "completions/max_terminated_length": 1533.4, + "completions/mean_length": 1150.565625, + "completions/mean_terminated_length": 1150.565625, + "completions/min_length": 922.6, + "completions/min_terminated_length": 922.6, + "entropy": 0.2696438133716583, + "epoch": 2.9670975323149236, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6207783222198486, + "learning_rate": 2.0971650109038041e-07, + "loss": 0.0028, + "num_tokens": 340408995.0, + "reward": 0.8907291650772095, + "reward_std": 0.09921484291553498, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8907291650772095, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19189145267009736, + "sampling/importance_sampling_ratio/max": 1.9334681987762452, + "sampling/importance_sampling_ratio/mean": 0.9999927043914795, + "sampling/importance_sampling_ratio/min": 0.31438209041953086, + "sampling/sampling_logp_difference/max": 1.3856622934341432, + "sampling/sampling_logp_difference/mean": 0.013535279594361783, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1737.8, + "completions/max_terminated_length": 1737.8, + "completions/mean_length": 1185.38125, + "completions/mean_terminated_length": 1185.38125, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "entropy": 0.2622179836034775, + "epoch": 2.972972972972973, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 2.0911073418948388e-07, + "loss": 0.0044, + "num_tokens": 341100077.0, + "reward": 0.9062500119209289, + "reward_std": 0.07759927660226822, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9062500119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19565635025501252, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000108242034913, + "sampling/importance_sampling_ratio/min": 0.3265382140874863, + "sampling/sampling_logp_difference/max": 1.1886168718338013, + "sampling/sampling_logp_difference/mean": 0.013404231891036034, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1551.0, + "completions/max_terminated_length": 1551.0, + "completions/mean_length": 1126.5625, + "completions/mean_terminated_length": 1126.5625, + "completions/min_length": 860.8, + "completions/min_terminated_length": 860.8, + "entropy": 0.2481956660747528, + "epoch": 2.9788484136310225, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4216060936450958, + "learning_rate": 2.0850496728858734e-07, + "loss": -0.0042, + "num_tokens": 341751889.0, + "reward": 0.9169270992279053, + "reward_std": 0.07064950466156006, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9169270992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18254780471324922, + "sampling/importance_sampling_ratio/max": 1.964564847946167, + "sampling/importance_sampling_ratio/mean": 1.0000295996665955, + "sampling/importance_sampling_ratio/min": 0.3919122636318207, + "sampling/sampling_logp_difference/max": 1.0612175703048705, + "sampling/sampling_logp_difference/mean": 0.012986170686781406, + "step": 2535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1424.6, + "completions/max_terminated_length": 1424.6, + "completions/mean_length": 1110.55625, + "completions/mean_terminated_length": 1110.55625, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "entropy": 0.25095831155776976, + "epoch": 2.9847238542890717, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.42990821599960327, + "learning_rate": 2.0789920038769083e-07, + "loss": -0.0017, + "num_tokens": 342441411.0, + "reward": 0.9187500238418579, + "reward_std": 0.07234707698225976, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9187500238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1585363283753395, + "sampling/importance_sampling_ratio/max": 1.9233515501022338, + "sampling/importance_sampling_ratio/mean": 0.9999359726905823, + "sampling/importance_sampling_ratio/min": 0.31806144714355467, + "sampling/sampling_logp_difference/max": 1.177621603012085, + "sampling/sampling_logp_difference/mean": 0.013096613995730876, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1723.8, + "completions/max_terminated_length": 1723.8, + "completions/mean_length": 1173.43125, + "completions/mean_terminated_length": 1173.43125, + "completions/min_length": 891.2, + "completions/min_terminated_length": 891.2, + "entropy": 0.245199453830719, + "epoch": 2.990599294947121, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4453013837337494, + "learning_rate": 2.0729343348679427e-07, + "loss": 0.0001, + "num_tokens": 343163101.0, + "reward": 0.8479166746139526, + "reward_std": 0.07755421325564385, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8479166746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.233888578414917, + "sampling/importance_sampling_ratio/max": 1.944665217399597, + "sampling/importance_sampling_ratio/mean": 0.9999553084373474, + "sampling/importance_sampling_ratio/min": 0.2975509911775589, + "sampling/sampling_logp_difference/max": 1.2434906959533691, + "sampling/sampling_logp_difference/mean": 0.012770450860261916, + "step": 2545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1660.2, + "completions/max_terminated_length": 1660.2, + "completions/mean_length": 1165.56875, + "completions/mean_terminated_length": 1165.56875, + "completions/min_length": 847.8, + "completions/min_terminated_length": 847.8, + "entropy": 0.24514368772506714, + "epoch": 2.9964747356051706, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4213135242462158, + "learning_rate": 2.0668766658589773e-07, + "loss": -0.0007, + "num_tokens": 343836915.0, + "reward": 0.801562511920929, + "reward_std": 0.07066599875688553, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.801562511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27490745335817335, + "sampling/importance_sampling_ratio/max": 1.9496089696884156, + "sampling/importance_sampling_ratio/mean": 1.0000056385993958, + "sampling/importance_sampling_ratio/min": 0.3634680390357971, + "sampling/sampling_logp_difference/max": 1.1743841171264648, + "sampling/sampling_logp_difference/mean": 0.012745716236531734, + "step": 2550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1566.8, + "completions/max_terminated_length": 1566.8, + "completions/mean_length": 1120.646875, + "completions/mean_terminated_length": 1120.646875, + "completions/min_length": 823.2, + "completions/min_terminated_length": 823.2, + "entropy": 0.23687808513641356, + "epoch": 3.00235017626322, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 2.060818996850012e-07, + "loss": 0.0039, + "num_tokens": 344543858.0, + "reward": 0.7984375119209289, + "reward_std": 0.07464845180511474, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7984375119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3012055605649948, + "sampling/importance_sampling_ratio/max": 1.9975390911102295, + "sampling/importance_sampling_ratio/mean": 1.0000576019287108, + "sampling/importance_sampling_ratio/min": 0.38463932275772095, + "sampling/sampling_logp_difference/max": 1.1026000499725341, + "sampling/sampling_logp_difference/mean": 0.012608602643013, + "step": 2555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1455.6, + "completions/max_terminated_length": 1455.6, + "completions/mean_length": 1078.309375, + "completions/mean_terminated_length": 1078.309375, + "completions/min_length": 782.2, + "completions/min_terminated_length": 782.2, + "entropy": 0.22939117550849913, + "epoch": 3.008225616921269, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5061334371566772, + "learning_rate": 2.0547613278410465e-07, + "loss": -0.0049, + "num_tokens": 345197557.0, + "reward": 0.8630208373069763, + "reward_std": 0.06913707032799721, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8630208373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2338823139667511, + "sampling/importance_sampling_ratio/max": 1.964646315574646, + "sampling/importance_sampling_ratio/mean": 0.9999775528907776, + "sampling/importance_sampling_ratio/min": 0.32452565338809547, + "sampling/sampling_logp_difference/max": 4.082516860961914, + "sampling/sampling_logp_difference/mean": 0.012144268304109574, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.0, + "completions/max_terminated_length": 1498.0, + "completions/mean_length": 1094.321875, + "completions/mean_terminated_length": 1094.321875, + "completions/min_length": 773.6, + "completions/min_terminated_length": 773.6, + "entropy": 0.23587908148765563, + "epoch": 3.0141010575793183, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.568652868270874, + "learning_rate": 2.0487036588320814e-07, + "loss": 0.0004, + "num_tokens": 345907900.0, + "reward": 0.8179687619209289, + "reward_std": 0.07180419340729713, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8179687619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2588413327932358, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000184774398804, + "sampling/importance_sampling_ratio/min": 0.22604906580163514, + "sampling/sampling_logp_difference/max": 3.168667030334473, + "sampling/sampling_logp_difference/mean": 0.012671238370239734, + "step": 2565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1898.6, + "completions/max_terminated_length": 1898.6, + "completions/mean_length": 1210.071875, + "completions/mean_terminated_length": 1210.071875, + "completions/min_length": 864.6, + "completions/min_terminated_length": 864.6, + "entropy": 0.2588326156139374, + "epoch": 3.0199764982373676, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.80168217420578, + "learning_rate": 2.042645989823116e-07, + "loss": 0.0028, + "num_tokens": 346622019.0, + "reward": 0.7994791746139527, + "reward_std": 0.1147149682044983, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7994791746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29545135200023653, + "sampling/importance_sampling_ratio/max": 1.985557770729065, + "sampling/importance_sampling_ratio/mean": 1.0000051379203796, + "sampling/importance_sampling_ratio/min": 0.32981371879577637, + "sampling/sampling_logp_difference/max": 1.1296478509902954, + "sampling/sampling_logp_difference/mean": 0.013445395790040494, + "step": 2570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1761.2, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 1148.53125, + "completions/mean_terminated_length": 1144.4665771484374, + "completions/min_length": 832.6, + "completions/min_terminated_length": 832.6, + "entropy": 0.26161502599716185, + "epoch": 3.0258519388954173, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8263538479804993, + "learning_rate": 2.0365883208141507e-07, + "loss": -0.0032, + "num_tokens": 347319897.0, + "reward": 0.861718761920929, + "reward_std": 0.08996040225028992, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.861718761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22977930754423143, + "sampling/importance_sampling_ratio/max": 1.9648932695388794, + "sampling/importance_sampling_ratio/mean": 1.000153088569641, + "sampling/importance_sampling_ratio/min": 0.30643958374857905, + "sampling/sampling_logp_difference/max": 1.5094314098358155, + "sampling/sampling_logp_difference/mean": 0.013726024515926838, + "step": 2575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 1160.00625, + "completions/mean_terminated_length": 1160.00625, + "completions/min_length": 909.2, + "completions/min_terminated_length": 909.2, + "entropy": 0.248623988032341, + "epoch": 3.0317273795534665, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 2.0305306518051853e-07, + "loss": 0.0021, + "num_tokens": 347988235.0, + "reward": 0.8042708396911621, + "reward_std": 0.06089485287666321, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8042708396911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2598120987415314, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000272750854493, + "sampling/importance_sampling_ratio/min": 0.29968391843140124, + "sampling/sampling_logp_difference/max": 1.8056718349456786, + "sampling/sampling_logp_difference/mean": 0.01313832849264145, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1599.6, + "completions/max_terminated_length": 1599.6, + "completions/mean_length": 1228.05625, + "completions/mean_terminated_length": 1228.05625, + "completions/min_length": 894.6, + "completions/min_terminated_length": 894.6, + "entropy": 0.25133021771907804, + "epoch": 3.0376028202115157, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6442340612411499, + "learning_rate": 2.0244729827962197e-07, + "loss": 0.0001, + "num_tokens": 348676301.0, + "reward": 0.8003125071525574, + "reward_std": 0.11148321777582168, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8003125190734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27468371093273164, + "sampling/importance_sampling_ratio/max": 1.8601706743240356, + "sampling/importance_sampling_ratio/mean": 0.9999647378921509, + "sampling/importance_sampling_ratio/min": 0.38169229626655576, + "sampling/sampling_logp_difference/max": 1.0052381753921509, + "sampling/sampling_logp_difference/mean": 0.012974300980567932, + "step": 2585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1565.2, + "completions/max_terminated_length": 1565.2, + "completions/mean_length": 1092.753125, + "completions/mean_terminated_length": 1092.753125, + "completions/min_length": 796.8, + "completions/min_terminated_length": 796.8, + "entropy": 0.25149821043014525, + "epoch": 3.0434782608695654, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.39658764004707336, + "learning_rate": 2.0184153137872546e-07, + "loss": -0.0003, + "num_tokens": 349333486.0, + "reward": 0.8057291865348816, + "reward_std": 0.07588431015610694, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8057291865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2633956283330917, + "sampling/importance_sampling_ratio/max": 1.9642632484436036, + "sampling/importance_sampling_ratio/mean": 1.0000943183898925, + "sampling/importance_sampling_ratio/min": 0.43040544986724855, + "sampling/sampling_logp_difference/max": 1.0892863035202027, + "sampling/sampling_logp_difference/mean": 0.013009889796376229, + "step": 2590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1633.4, + "completions/max_terminated_length": 1633.4, + "completions/mean_length": 1181.28125, + "completions/mean_terminated_length": 1181.28125, + "completions/min_length": 899.2, + "completions/min_terminated_length": 899.2, + "entropy": 0.24790201783180238, + "epoch": 3.0493537015276146, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 2.0123576447782892e-07, + "loss": -0.0011, + "num_tokens": 350027192.0, + "reward": 0.901562511920929, + "reward_std": 0.09831328690052032, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9015625238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18027370274066926, + "sampling/importance_sampling_ratio/max": 1.9673321008682252, + "sampling/importance_sampling_ratio/mean": 1.0000709176063538, + "sampling/importance_sampling_ratio/min": 0.36134466230869294, + "sampling/sampling_logp_difference/max": 1.0996488094329835, + "sampling/sampling_logp_difference/mean": 0.012829454429447652, + "step": 2595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.4, + "completions/max_terminated_length": 1514.4, + "completions/mean_length": 1094.546875, + "completions/mean_terminated_length": 1094.546875, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "entropy": 0.24174076914787293, + "epoch": 3.055229142185664, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.3839050829410553, + "learning_rate": 2.0062999757693238e-07, + "loss": -0.0012, + "num_tokens": 350717863.0, + "reward": 0.8422916889190674, + "reward_std": 0.10253577679395676, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8422916889190674, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20914290547370912, + "sampling/importance_sampling_ratio/max": 1.9016016483306886, + "sampling/importance_sampling_ratio/mean": 0.9999129056930542, + "sampling/importance_sampling_ratio/min": 0.3612874448299408, + "sampling/sampling_logp_difference/max": 1.1594652891159059, + "sampling/sampling_logp_difference/mean": 0.013087780401110648, + "step": 2600 + }, + { + "epoch": 3.055229142185664, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1527.92, + "eval_completions/max_terminated_length": 1527.92, + "eval_completions/mean_length": 1115.324375, + "eval_completions/mean_terminated_length": 1115.324375, + "eval_completions/min_length": 830.56, + "eval_completions/min_terminated_length": 830.56, + "eval_entropy": 0.24745357692241668, + "eval_frac_reward_zero_std": 0.59, + "eval_loss": 0.0014860860537737608, + "eval_num_tokens": 350717863.0, + "eval_reward": 0.764208345413208, + "eval_reward_std": 0.08211088687181473, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7642083430290222, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2958526027202606, + "eval_runtime": 417.5732, + "eval_samples_per_second": 0.239, + "eval_sampling/importance_sampling_ratio/max": 1.9392410516738892, + "eval_sampling/importance_sampling_ratio/mean": 1.0000256299972534, + "eval_sampling/importance_sampling_ratio/min": 0.30997018457235326, + "eval_sampling/sampling_logp_difference/max": 2.1229168796539306, + "eval_sampling/sampling_logp_difference/mean": 0.013145581409335137, + "eval_steps_per_second": 0.005, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1621.6, + "completions/max_terminated_length": 1563.4, + "completions/mean_length": 1089.31875, + "completions/mean_terminated_length": 1076.3976806640626, + "completions/min_length": 790.0, + "completions/min_terminated_length": 790.0, + "entropy": 0.23385756611824035, + "epoch": 3.061104582843713, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.890629231929779, + "learning_rate": 2.0002423067603585e-07, + "loss": -0.0198, + "num_tokens": 351392561.0, + "reward": 0.7359375119209289, + "reward_std": 0.17164357751607895, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7359375119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3002436876296997, + "sampling/importance_sampling_ratio/max": 1.9385489702224732, + "sampling/importance_sampling_ratio/mean": 1.0000449180603028, + "sampling/importance_sampling_ratio/min": 0.430584990978241, + "sampling/sampling_logp_difference/max": 0.9488114356994629, + "sampling/sampling_logp_difference/mean": 0.012672055885195732, + "step": 2605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1486.8, + "completions/max_terminated_length": 1486.8, + "completions/mean_length": 1123.546875, + "completions/mean_terminated_length": 1123.546875, + "completions/min_length": 811.6, + "completions/min_terminated_length": 811.6, + "entropy": 0.24552173912525177, + "epoch": 3.066980023501763, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 1.994184637751393e-07, + "loss": -0.0008, + "num_tokens": 352087152.0, + "reward": 0.8692708492279053, + "reward_std": 0.10569213405251503, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8692708611488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21039358377456666, + "sampling/importance_sampling_ratio/max": 1.9522597312927246, + "sampling/importance_sampling_ratio/mean": 0.999960207939148, + "sampling/importance_sampling_ratio/min": 0.3872329980134964, + "sampling/sampling_logp_difference/max": 1.0530877590179444, + "sampling/sampling_logp_difference/mean": 0.01298385914415121, + "step": 2610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1603.4, + "completions/max_terminated_length": 1603.4, + "completions/mean_length": 1123.93125, + "completions/mean_terminated_length": 1123.93125, + "completions/min_length": 772.4, + "completions/min_terminated_length": 772.4, + "entropy": 0.23556230068206788, + "epoch": 3.072855464159812, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.8286966681480408, + "learning_rate": 1.988126968742428e-07, + "loss": -0.003, + "num_tokens": 352771130.0, + "reward": 0.7554687619209289, + "reward_std": 0.05466256886720657, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7554687619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2613639414310455, + "sampling/importance_sampling_ratio/max": 1.8773372888565063, + "sampling/importance_sampling_ratio/mean": 0.9999829411506653, + "sampling/importance_sampling_ratio/min": 0.374179807305336, + "sampling/sampling_logp_difference/max": 1.0663438320159913, + "sampling/sampling_logp_difference/mean": 0.012645265832543374, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1776.4, + "completions/max_terminated_length": 1694.8, + "completions/mean_length": 1166.609375, + "completions/mean_terminated_length": 1162.793359375, + "completions/min_length": 750.8, + "completions/min_terminated_length": 750.8, + "entropy": 0.23736243844032287, + "epoch": 3.0787309048178613, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.40863195061683655, + "learning_rate": 1.9820692997334626e-07, + "loss": -0.0048, + "num_tokens": 353456681.0, + "reward": 0.8520833492279053, + "reward_std": 0.06194302663207054, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8520833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2339508056640625, + "sampling/importance_sampling_ratio/max": 1.8806124925613403, + "sampling/importance_sampling_ratio/mean": 1.0000197887420654, + "sampling/importance_sampling_ratio/min": 0.3954928398132324, + "sampling/sampling_logp_difference/max": 0.9414288401603699, + "sampling/sampling_logp_difference/mean": 0.012598930113017558, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1588.8, + "completions/max_terminated_length": 1588.8, + "completions/mean_length": 1101.5, + "completions/mean_terminated_length": 1101.5, + "completions/min_length": 774.2, + "completions/min_terminated_length": 774.2, + "entropy": 0.2684099614620209, + "epoch": 3.0846063454759105, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6336551308631897, + "learning_rate": 1.976011630724497e-07, + "loss": -0.0025, + "num_tokens": 354135657.0, + "reward": 0.8446875095367432, + "reward_std": 0.06188718155026436, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8446875095367432, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2825276643037796, + "sampling/importance_sampling_ratio/max": 1.9036636829376221, + "sampling/importance_sampling_ratio/mean": 0.9998575687408447, + "sampling/importance_sampling_ratio/min": 0.34003419876098634, + "sampling/sampling_logp_difference/max": 1.3024271130561829, + "sampling/sampling_logp_difference/mean": 0.013950708508491515, + "step": 2625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1805.8, + "completions/max_terminated_length": 1805.8, + "completions/mean_length": 1192.175, + "completions/mean_terminated_length": 1192.175, + "completions/min_length": 844.6, + "completions/min_terminated_length": 844.6, + "entropy": 0.2670820116996765, + "epoch": 3.09048178613396, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 1.9699539617155316e-07, + "loss": 0.0015, + "num_tokens": 354847601.0, + "reward": 0.7770833492279052, + "reward_std": 0.055994272232055664, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7770833492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28656685948371885, + "sampling/importance_sampling_ratio/max": 1.9876051187515258, + "sampling/importance_sampling_ratio/mean": 0.9998288512229919, + "sampling/importance_sampling_ratio/min": 0.3348564386367798, + "sampling/sampling_logp_difference/max": 1.1707114219665526, + "sampling/sampling_logp_difference/mean": 0.013887059316039086, + "step": 2630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1861.8, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 1216.940625, + "completions/mean_terminated_length": 1209.6089599609375, + "completions/min_length": 830.6, + "completions/min_terminated_length": 830.6, + "entropy": 0.25017284154891967, + "epoch": 3.0963572267920094, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6755982637405396, + "learning_rate": 1.9638962927065662e-07, + "loss": -0.0063, + "num_tokens": 355569526.0, + "reward": 0.7526041984558105, + "reward_std": 0.1029469721019268, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7526041984558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3308384269475937, + "sampling/importance_sampling_ratio/max": 1.9819922924041748, + "sampling/importance_sampling_ratio/mean": 1.0000592708587646, + "sampling/importance_sampling_ratio/min": 0.35656105279922484, + "sampling/sampling_logp_difference/max": 1.0571811199188232, + "sampling/sampling_logp_difference/mean": 0.013306570611894131, + "step": 2635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1624.4, + "completions/max_terminated_length": 1624.4, + "completions/mean_length": 1158.015625, + "completions/mean_terminated_length": 1158.015625, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.2682430505752563, + "epoch": 3.1022326674500587, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8624192476272583, + "learning_rate": 1.9578386236976011e-07, + "loss": 0.0081, + "num_tokens": 356236859.0, + "reward": 0.8321875095367431, + "reward_std": 0.09990925379097462, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8321875095367431, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23295383900403976, + "sampling/importance_sampling_ratio/max": 1.927258276939392, + "sampling/importance_sampling_ratio/mean": 1.000026774406433, + "sampling/importance_sampling_ratio/min": 0.3010182499885559, + "sampling/sampling_logp_difference/max": 1.337507677078247, + "sampling/sampling_logp_difference/mean": 0.013790984638035297, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1496.6, + "completions/max_terminated_length": 1496.6, + "completions/mean_length": 1131.265625, + "completions/mean_terminated_length": 1131.265625, + "completions/min_length": 816.2, + "completions/min_terminated_length": 816.2, + "entropy": 0.23955595791339873, + "epoch": 3.108108108108108, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4020726680755615, + "learning_rate": 1.9517809546886358e-07, + "loss": 0.002, + "num_tokens": 356920928.0, + "reward": 0.7802083492279053, + "reward_std": 0.07774590328335762, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7802083492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30067052841186526, + "sampling/importance_sampling_ratio/max": 1.9295808315277099, + "sampling/importance_sampling_ratio/mean": 0.999874758720398, + "sampling/importance_sampling_ratio/min": 0.3665719389915466, + "sampling/sampling_logp_difference/max": 1.078517460823059, + "sampling/sampling_logp_difference/mean": 0.012707036547362804, + "step": 2645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1511.6, + "completions/max_terminated_length": 1511.6, + "completions/mean_length": 1159.884375, + "completions/mean_terminated_length": 1159.884375, + "completions/min_length": 906.2, + "completions/min_terminated_length": 906.2, + "entropy": 0.23638878464698793, + "epoch": 3.1139835487661576, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7374940514564514, + "learning_rate": 1.9457232856796704e-07, + "loss": 0.0033, + "num_tokens": 357597563.0, + "reward": 0.8471354246139526, + "reward_std": 0.08411812335252762, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8471354246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19521130323410035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999265074729919, + "sampling/importance_sampling_ratio/min": 0.253702437877655, + "sampling/sampling_logp_difference/max": 1.50885751247406, + "sampling/sampling_logp_difference/mean": 0.01256478950381279, + "step": 2650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1847.4, + "completions/max_terminated_length": 1847.4, + "completions/mean_length": 1120.740625, + "completions/mean_terminated_length": 1120.740625, + "completions/min_length": 848.4, + "completions/min_terminated_length": 848.4, + "entropy": 0.2554923087358475, + "epoch": 3.119858989424207, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.7112350463867188, + "learning_rate": 1.939665616670705e-07, + "loss": 0.0001, + "num_tokens": 358283448.0, + "reward": 0.7602083384990692, + "reward_std": 0.07785547077655793, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7602083384990692, + "rewards/e2e_recall_precision_mixed_reward/std": 0.296068063378334, + "sampling/importance_sampling_ratio/max": 1.9592190265655518, + "sampling/importance_sampling_ratio/mean": 1.000104260444641, + "sampling/importance_sampling_ratio/min": 0.33376007676124575, + "sampling/sampling_logp_difference/max": 1.1508065223693849, + "sampling/sampling_logp_difference/mean": 0.01355198472738266, + "step": 2655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1480.2, + "completions/max_terminated_length": 1480.2, + "completions/mean_length": 1106.128125, + "completions/mean_terminated_length": 1106.128125, + "completions/min_length": 762.4, + "completions/min_terminated_length": 762.4, + "entropy": 0.22904159724712372, + "epoch": 3.125734430082256, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7636279463768005, + "learning_rate": 1.93360794766174e-07, + "loss": 0.0013, + "num_tokens": 358951905.0, + "reward": 0.8581250190734864, + "reward_std": 0.0991522267460823, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8581250190734864, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20884974002838136, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999507665634155, + "sampling/importance_sampling_ratio/min": 0.3788798153400421, + "sampling/sampling_logp_difference/max": 0.9996652841567993, + "sampling/sampling_logp_difference/mean": 0.012032361328601837, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.2, + "completions/max_terminated_length": 1593.2, + "completions/mean_length": 1190.990625, + "completions/mean_terminated_length": 1190.990625, + "completions/min_length": 888.6, + "completions/min_terminated_length": 888.6, + "entropy": 0.2583101183176041, + "epoch": 3.1316098707403057, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.444450706243515, + "learning_rate": 1.9275502786527743e-07, + "loss": -0.0021, + "num_tokens": 359662430.0, + "reward": 0.7358854293823243, + "reward_std": 0.10120119452476502, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7358854293823243, + "rewards/e2e_recall_precision_mixed_reward/std": 0.35854376256465914, + "sampling/importance_sampling_ratio/max": 1.9906119108200073, + "sampling/importance_sampling_ratio/mean": 0.9998132705688476, + "sampling/importance_sampling_ratio/min": 0.28991485238075254, + "sampling/sampling_logp_difference/max": 1.2943898916244507, + "sampling/sampling_logp_difference/mean": 0.013436655886471272, + "step": 2665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1751.8, + "completions/max_terminated_length": 1751.8, + "completions/mean_length": 1202.046875, + "completions/mean_terminated_length": 1202.046875, + "completions/min_length": 846.2, + "completions/min_terminated_length": 846.2, + "entropy": 0.25281980633735657, + "epoch": 3.137485311398355, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.7313327789306641, + "learning_rate": 1.921492609643809e-07, + "loss": -0.0009, + "num_tokens": 360371181.0, + "reward": 0.7715625166893005, + "reward_std": 0.13862871527671813, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7715625166893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32820015847682954, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000031113624572, + "sampling/importance_sampling_ratio/min": 0.4037406623363495, + "sampling/sampling_logp_difference/max": 1.0691129326820374, + "sampling/sampling_logp_difference/mean": 0.013268834352493286, + "step": 2670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1509.4, + "completions/max_terminated_length": 1509.4, + "completions/mean_length": 1149.64375, + "completions/mean_terminated_length": 1149.64375, + "completions/min_length": 883.0, + "completions/min_terminated_length": 883.0, + "entropy": 0.25547915995121, + "epoch": 3.143360752056404, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.43228134512901306, + "learning_rate": 1.9154349406348435e-07, + "loss": -0.002, + "num_tokens": 361056763.0, + "reward": 0.9020833492279052, + "reward_std": 0.06553339175879955, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9020833492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1975732207298279, + "sampling/importance_sampling_ratio/max": 1.9426953315734863, + "sampling/importance_sampling_ratio/mean": 1.000106644630432, + "sampling/importance_sampling_ratio/min": 0.338831490278244, + "sampling/sampling_logp_difference/max": 1.2730365514755249, + "sampling/sampling_logp_difference/mean": 0.01309330053627491, + "step": 2675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1539.4, + "completions/max_terminated_length": 1539.4, + "completions/mean_length": 1171.35625, + "completions/mean_terminated_length": 1171.35625, + "completions/min_length": 934.2, + "completions/min_terminated_length": 934.2, + "entropy": 0.2575560688972473, + "epoch": 3.1492361927144534, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 1.9093772716258782e-07, + "loss": -0.0022, + "num_tokens": 361735949.0, + "reward": 0.7588020920753479, + "reward_std": 0.09332804828882217, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7588020920753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3088227480649948, + "sampling/importance_sampling_ratio/max": 1.981947898864746, + "sampling/importance_sampling_ratio/mean": 0.9999451875686646, + "sampling/importance_sampling_ratio/min": 0.3808894753456116, + "sampling/sampling_logp_difference/max": 0.9771119594573975, + "sampling/sampling_logp_difference/mean": 0.013271708972752094, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1766.6, + "completions/max_terminated_length": 1766.6, + "completions/mean_length": 1225.53125, + "completions/mean_terminated_length": 1225.53125, + "completions/min_length": 905.6, + "completions/min_terminated_length": 905.6, + "entropy": 0.2539756655693054, + "epoch": 3.155111633372503, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.5203064680099487, + "learning_rate": 1.903319602616913e-07, + "loss": 0.0048, + "num_tokens": 362455095.0, + "reward": 0.8886458516120911, + "reward_std": 0.11133290827274323, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8886458516120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1978231579065323, + "sampling/importance_sampling_ratio/max": 1.8321751832962037, + "sampling/importance_sampling_ratio/mean": 1.0000116109848023, + "sampling/importance_sampling_ratio/min": 0.3633933126926422, + "sampling/sampling_logp_difference/max": 1.0385831832885741, + "sampling/sampling_logp_difference/mean": 0.01303493045270443, + "step": 2685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1516.4, + "completions/max_terminated_length": 1516.4, + "completions/mean_length": 1171.89375, + "completions/mean_terminated_length": 1171.89375, + "completions/min_length": 912.4, + "completions/min_terminated_length": 912.4, + "entropy": 0.25234430730342866, + "epoch": 3.1609870740305523, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6282800436019897, + "learning_rate": 1.8972619336079477e-07, + "loss": 0.0072, + "num_tokens": 363167109.0, + "reward": 0.752343761920929, + "reward_std": 0.09111793488264083, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7523437738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30076233446598055, + "sampling/importance_sampling_ratio/max": 1.8481305837631226, + "sampling/importance_sampling_ratio/mean": 0.9998624682426452, + "sampling/importance_sampling_ratio/min": 0.39715090990066526, + "sampling/sampling_logp_difference/max": 0.9621063709259033, + "sampling/sampling_logp_difference/mean": 0.01284482330083847, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1668.6, + "completions/max_terminated_length": 1668.6, + "completions/mean_length": 1182.228125, + "completions/mean_terminated_length": 1182.228125, + "completions/min_length": 896.4, + "completions/min_terminated_length": 896.4, + "entropy": 0.2626140534877777, + "epoch": 3.1668625146886016, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6545633673667908, + "learning_rate": 1.8912042645989823e-07, + "loss": -0.0016, + "num_tokens": 363846078.0, + "reward": 0.8580729246139527, + "reward_std": 0.11600885093212128, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8580729246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22339286804199218, + "sampling/importance_sampling_ratio/max": 1.9617747783660888, + "sampling/importance_sampling_ratio/mean": 0.9999674677848815, + "sampling/importance_sampling_ratio/min": 0.3762934744358063, + "sampling/sampling_logp_difference/max": 1.0206952333450316, + "sampling/sampling_logp_difference/mean": 0.013403966650366783, + "step": 2695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1689.8, + "completions/max_terminated_length": 1689.8, + "completions/mean_length": 1165.94375, + "completions/mean_terminated_length": 1165.94375, + "completions/min_length": 817.2, + "completions/min_terminated_length": 817.2, + "entropy": 0.24900731146335603, + "epoch": 3.172737955346651, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6175356507301331, + "learning_rate": 1.885146595590017e-07, + "loss": 0.0045, + "num_tokens": 364534908.0, + "reward": 0.881250011920929, + "reward_std": 0.04110444188117981, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.881250011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21535277664661406, + "sampling/importance_sampling_ratio/max": 1.8730939149856567, + "sampling/importance_sampling_ratio/mean": 1.0000895857810974, + "sampling/importance_sampling_ratio/min": 0.3922951459884644, + "sampling/sampling_logp_difference/max": 0.9598684906959534, + "sampling/sampling_logp_difference/mean": 0.01295645758509636, + "step": 2700 + }, + { + "epoch": 3.172737955346651, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1605.36, + "eval_completions/max_terminated_length": 1605.36, + "eval_completions/mean_length": 1145.510625, + "eval_completions/mean_terminated_length": 1145.510625, + "eval_completions/min_length": 868.64, + "eval_completions/min_terminated_length": 868.64, + "eval_entropy": 0.25542369663715364, + "eval_frac_reward_zero_std": 0.62, + "eval_loss": 0.002036468591541052, + "eval_num_tokens": 364534908.0, + "eval_reward": 0.7656979310512543, + "eval_reward_std": 0.07679802820086479, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7656979298591614, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2901380580663681, + "eval_runtime": 440.2857, + "eval_samples_per_second": 0.227, + "eval_sampling/importance_sampling_ratio/max": 1.976495280265808, + "eval_sampling/importance_sampling_ratio/mean": 1.0000054264068603, + "eval_sampling/importance_sampling_ratio/min": 0.33669356286525726, + "eval_sampling/sampling_logp_difference/max": 1.1901243042945862, + "eval_sampling/sampling_logp_difference/mean": 0.013190858326852321, + "eval_steps_per_second": 0.005, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1389.0, + "completions/max_terminated_length": 1389.0, + "completions/mean_length": 1077.025, + "completions/mean_terminated_length": 1077.025, + "completions/min_length": 785.2, + "completions/min_terminated_length": 785.2, + "entropy": 0.22875811159610748, + "epoch": 3.1786133960047005, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5512154698371887, + "learning_rate": 1.8790889265810513e-07, + "loss": 0.0026, + "num_tokens": 365209364.0, + "reward": 0.950000011920929, + "reward_std": 0.06352402791380882, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.950000011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13289770856499672, + "sampling/importance_sampling_ratio/max": 1.9655184507369996, + "sampling/importance_sampling_ratio/mean": 0.999955701828003, + "sampling/importance_sampling_ratio/min": 0.4531009137630463, + "sampling/sampling_logp_difference/max": 0.8712506055831909, + "sampling/sampling_logp_difference/mean": 0.011957179754972458, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1785.4, + "completions/max_terminated_length": 1785.4, + "completions/mean_length": 1248.996875, + "completions/mean_terminated_length": 1248.996875, + "completions/min_length": 848.6, + "completions/min_terminated_length": 848.6, + "entropy": 0.2754764974117279, + "epoch": 3.1844888366627497, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 1.8730312575720862e-07, + "loss": 0.0031, + "num_tokens": 365918243.0, + "reward": 0.7947916746139526, + "reward_std": 0.07183088660240174, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7947916746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2950818926095963, + "sampling/importance_sampling_ratio/max": 1.963321566581726, + "sampling/importance_sampling_ratio/mean": 0.9999127626419068, + "sampling/importance_sampling_ratio/min": 0.33920138217235946, + "sampling/sampling_logp_difference/max": 3.2019110441207888, + "sampling/sampling_logp_difference/mean": 0.013784621469676494, + "step": 2710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.2, + "completions/max_terminated_length": 1431.2, + "completions/mean_length": 1097.05, + "completions/mean_terminated_length": 1097.05, + "completions/min_length": 799.8, + "completions/min_terminated_length": 799.8, + "entropy": 0.23867568373680115, + "epoch": 3.190364277320799, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5815135836601257, + "learning_rate": 1.8669735885631208e-07, + "loss": 0.0015, + "num_tokens": 366575027.0, + "reward": 0.839062511920929, + "reward_std": 0.11312199383974075, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.839062511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25024967789649966, + "sampling/importance_sampling_ratio/max": 1.9745015382766724, + "sampling/importance_sampling_ratio/mean": 0.9999501466751098, + "sampling/importance_sampling_ratio/min": 0.2752092361450195, + "sampling/sampling_logp_difference/max": 1.4468851327896117, + "sampling/sampling_logp_difference/mean": 0.012776543572545051, + "step": 2715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.8, + "completions/max_terminated_length": 1545.8, + "completions/mean_length": 1100.778125, + "completions/mean_terminated_length": 1100.778125, + "completions/min_length": 741.6, + "completions/min_terminated_length": 741.6, + "entropy": 0.23921369910240173, + "epoch": 3.196239717978848, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.3939354121685028, + "learning_rate": 1.8609159195541555e-07, + "loss": -0.0015, + "num_tokens": 367257964.0, + "reward": 0.71171875, + "reward_std": 0.04870991818606853, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.711718761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3381441444158554, + "sampling/importance_sampling_ratio/max": 1.9069401264190673, + "sampling/importance_sampling_ratio/mean": 1.0001029253005982, + "sampling/importance_sampling_ratio/min": 0.37758385539054873, + "sampling/sampling_logp_difference/max": 1.0444597005844116, + "sampling/sampling_logp_difference/mean": 0.012857604771852493, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1508.4, + "completions/max_terminated_length": 1508.4, + "completions/mean_length": 1143.01875, + "completions/mean_terminated_length": 1143.01875, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.25859539210796356, + "epoch": 3.202115158636898, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 1.85485825054519e-07, + "loss": 0.0021, + "num_tokens": 367949282.0, + "reward": 0.8816145896911621, + "reward_std": 0.04836602807044983, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8816145896911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15824837386608123, + "sampling/importance_sampling_ratio/max": 1.9535146951675415, + "sampling/importance_sampling_ratio/mean": 0.9999256372451782, + "sampling/importance_sampling_ratio/min": 0.3492464393377304, + "sampling/sampling_logp_difference/max": 1.4997458934783936, + "sampling/sampling_logp_difference/mean": 0.01358124241232872, + "step": 2725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1605.8, + "completions/max_terminated_length": 1605.8, + "completions/mean_length": 1144.803125, + "completions/mean_terminated_length": 1144.803125, + "completions/min_length": 821.6, + "completions/min_terminated_length": 821.6, + "entropy": 0.23201414942741394, + "epoch": 3.207990599294947, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.4138811528682709, + "learning_rate": 1.8488005815362247e-07, + "loss": 0.0035, + "num_tokens": 368656163.0, + "reward": 0.8598958492279053, + "reward_std": 0.13527624905109406, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8598958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21621148884296418, + "sampling/importance_sampling_ratio/max": 1.981991744041443, + "sampling/importance_sampling_ratio/mean": 0.9999581217765808, + "sampling/importance_sampling_ratio/min": 0.25117518454790116, + "sampling/sampling_logp_difference/max": 1.7178606867790223, + "sampling/sampling_logp_difference/mean": 0.01244208738207817, + "step": 2730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1546.2, + "completions/max_terminated_length": 1546.2, + "completions/mean_length": 1133.43125, + "completions/mean_terminated_length": 1133.43125, + "completions/min_length": 841.0, + "completions/min_terminated_length": 841.0, + "entropy": 0.24755406975746155, + "epoch": 3.2138660399529964, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6708360910415649, + "learning_rate": 1.8427429125272596e-07, + "loss": 0.0021, + "num_tokens": 369316909.0, + "reward": 0.8466145992279053, + "reward_std": 0.05173058435320854, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8466145992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2515969604253769, + "sampling/importance_sampling_ratio/max": 1.9390371322631836, + "sampling/importance_sampling_ratio/mean": 0.9999060034751892, + "sampling/importance_sampling_ratio/min": 0.317415851354599, + "sampling/sampling_logp_difference/max": 1.308079767227173, + "sampling/sampling_logp_difference/mean": 0.012958027422428131, + "step": 2735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1652.6, + "completions/max_terminated_length": 1652.6, + "completions/mean_length": 1203.203125, + "completions/mean_terminated_length": 1203.203125, + "completions/min_length": 840.4, + "completions/min_terminated_length": 840.4, + "entropy": 0.260967755317688, + "epoch": 3.219741480611046, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7388750314712524, + "learning_rate": 1.8366852435182942e-07, + "loss": -0.0027, + "num_tokens": 370055742.0, + "reward": 0.8242187619209289, + "reward_std": 0.08670442402362824, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8242187619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25668842494487765, + "sampling/importance_sampling_ratio/max": 1.9822523593902588, + "sampling/importance_sampling_ratio/mean": 1.0000456333160401, + "sampling/importance_sampling_ratio/min": 0.4507920503616333, + "sampling/sampling_logp_difference/max": 0.8359850645065308, + "sampling/sampling_logp_difference/mean": 0.013544493354856968, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.0, + "completions/max_terminated_length": 1580.0, + "completions/mean_length": 1152.165625, + "completions/mean_terminated_length": 1152.165625, + "completions/min_length": 845.6, + "completions/min_terminated_length": 845.6, + "entropy": 0.24874697625637054, + "epoch": 3.2256169212690953, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6925174593925476, + "learning_rate": 1.830627574509329e-07, + "loss": 0.0021, + "num_tokens": 370757203.0, + "reward": 0.9194791913032532, + "reward_std": 0.08012870997190476, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9194791913032532, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1872466579079628, + "sampling/importance_sampling_ratio/max": 1.917311930656433, + "sampling/importance_sampling_ratio/mean": 1.0000473737716675, + "sampling/importance_sampling_ratio/min": 0.3609789401292801, + "sampling/sampling_logp_difference/max": 1.0609714031219482, + "sampling/sampling_logp_difference/mean": 0.013005911372601986, + "step": 2745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.8, + "completions/max_terminated_length": 1640.8, + "completions/mean_length": 1168.940625, + "completions/mean_terminated_length": 1168.940625, + "completions/min_length": 891.2, + "completions/min_terminated_length": 891.2, + "entropy": 0.25572892725467683, + "epoch": 3.2314923619271445, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.42577195167541504, + "learning_rate": 1.8245699055003632e-07, + "loss": -0.0107, + "num_tokens": 371470048.0, + "reward": 0.8330729484558106, + "reward_std": 0.1004362728446722, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8330729484558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2636726438999176, + "sampling/importance_sampling_ratio/max": 1.963629388809204, + "sampling/importance_sampling_ratio/mean": 1.000008511543274, + "sampling/importance_sampling_ratio/min": 0.3254010289907455, + "sampling/sampling_logp_difference/max": 1.3843281507492065, + "sampling/sampling_logp_difference/mean": 0.013244516961276532, + "step": 2750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1605.8, + "completions/max_terminated_length": 1605.8, + "completions/mean_length": 1150.315625, + "completions/mean_terminated_length": 1150.315625, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.27356540560722353, + "epoch": 3.2373678025851937, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6596607565879822, + "learning_rate": 1.8185122364913979e-07, + "loss": -0.0018, + "num_tokens": 372153701.0, + "reward": 0.8119791865348815, + "reward_std": 0.07490371987223625, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8119791865348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26950059831142426, + "sampling/importance_sampling_ratio/max": 1.9359875917434692, + "sampling/importance_sampling_ratio/mean": 1.0000138759613038, + "sampling/importance_sampling_ratio/min": 0.3807866334915161, + "sampling/sampling_logp_difference/max": 1.138900876045227, + "sampling/sampling_logp_difference/mean": 0.014055828377604485, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1861.2, + "completions/max_terminated_length": 1861.2, + "completions/mean_length": 1226.828125, + "completions/mean_terminated_length": 1226.828125, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "entropy": 0.25837584137916564, + "epoch": 3.2432432432432434, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 1.8124545674824328e-07, + "loss": 0.0002, + "num_tokens": 372891134.0, + "reward": 0.7364583492279053, + "reward_std": 0.0557606402784586, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7364583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3232154339551926, + "sampling/importance_sampling_ratio/max": 1.9625327587127686, + "sampling/importance_sampling_ratio/mean": 1.0000850915908814, + "sampling/importance_sampling_ratio/min": 0.3799369066953659, + "sampling/sampling_logp_difference/max": 1.1325337648391725, + "sampling/sampling_logp_difference/mean": 0.01331845298409462, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1545.6, + "completions/max_terminated_length": 1545.6, + "completions/mean_length": 1169.91875, + "completions/mean_terminated_length": 1169.91875, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.2515501230955124, + "epoch": 3.2491186839012927, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5899607539176941, + "learning_rate": 1.8063968984734674e-07, + "loss": 0.0021, + "num_tokens": 373575492.0, + "reward": 0.8873437523841858, + "reward_std": 0.07848574072122574, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8873437523841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19398850649595262, + "sampling/importance_sampling_ratio/max": 1.8839639186859132, + "sampling/importance_sampling_ratio/mean": 0.9999752283096314, + "sampling/importance_sampling_ratio/min": 0.353549987077713, + "sampling/sampling_logp_difference/max": 1.1025007724761964, + "sampling/sampling_logp_difference/mean": 0.013109220005571843, + "step": 2765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1605.4, + "completions/max_terminated_length": 1605.4, + "completions/mean_length": 1208.865625, + "completions/mean_terminated_length": 1208.865625, + "completions/min_length": 946.2, + "completions/min_terminated_length": 946.2, + "entropy": 0.262452107667923, + "epoch": 3.254994124559342, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6078686714172363, + "learning_rate": 1.800339229464502e-07, + "loss": 0.0051, + "num_tokens": 374303577.0, + "reward": 0.7220833420753479, + "reward_std": 0.07195360362529754, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7220833420753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3346565842628479, + "sampling/importance_sampling_ratio/max": 1.9912936210632324, + "sampling/importance_sampling_ratio/mean": 0.9999467730522156, + "sampling/importance_sampling_ratio/min": 0.319632551074028, + "sampling/sampling_logp_difference/max": 1.1714837074279785, + "sampling/sampling_logp_difference/mean": 0.01357471402734518, + "step": 2770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1565.4, + "completions/max_terminated_length": 1565.4, + "completions/mean_length": 1132.821875, + "completions/mean_terminated_length": 1132.821875, + "completions/min_length": 803.6, + "completions/min_terminated_length": 803.6, + "entropy": 0.2516249448060989, + "epoch": 3.260869565217391, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.715618908405304, + "learning_rate": 1.7942815604555366e-07, + "loss": -0.0009, + "num_tokens": 374955184.0, + "reward": 0.8385416746139527, + "reward_std": 0.056407293677330016, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8385416865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20846658647060395, + "sampling/importance_sampling_ratio/max": 1.827885890007019, + "sampling/importance_sampling_ratio/mean": 1.000091540813446, + "sampling/importance_sampling_ratio/min": 0.37121389210224154, + "sampling/sampling_logp_difference/max": 1.0708453416824342, + "sampling/sampling_logp_difference/mean": 0.013041505217552185, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1532.4, + "completions/max_terminated_length": 1532.4, + "completions/mean_length": 1131.478125, + "completions/mean_terminated_length": 1131.478125, + "completions/min_length": 832.4, + "completions/min_terminated_length": 832.4, + "entropy": 0.24903405606746673, + "epoch": 3.266745005875441, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.7905736565589905, + "learning_rate": 1.7882238914465713e-07, + "loss": 0.0009, + "num_tokens": 375624905.0, + "reward": 0.9010416865348816, + "reward_std": 0.08152148127555847, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9010416865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1657480537891388, + "sampling/importance_sampling_ratio/max": 1.894256854057312, + "sampling/importance_sampling_ratio/mean": 0.9999601244926453, + "sampling/importance_sampling_ratio/min": 0.3288370221853256, + "sampling/sampling_logp_difference/max": 1.1827893733978272, + "sampling/sampling_logp_difference/mean": 0.012998024187982082, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1464.0, + "completions/max_terminated_length": 1464.0, + "completions/mean_length": 1107.759375, + "completions/mean_terminated_length": 1107.759375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.2499726951122284, + "epoch": 3.27262044653349, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.5189323425292969, + "learning_rate": 1.7821662224376062e-07, + "loss": 0.001, + "num_tokens": 376336844.0, + "reward": 0.8536458492279053, + "reward_std": 0.04098552390933037, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8536458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24642951488494874, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000021255016327, + "sampling/importance_sampling_ratio/min": 0.4128114223480225, + "sampling/sampling_logp_difference/max": 1.009805178642273, + "sampling/sampling_logp_difference/mean": 0.013290311396121978, + "step": 2785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1713.4, + "completions/max_terminated_length": 1711.2, + "completions/mean_length": 1176.153125, + "completions/mean_terminated_length": 1169.3762939453125, + "completions/min_length": 884.4, + "completions/min_terminated_length": 884.4, + "entropy": 0.25248327255249026, + "epoch": 3.2784958871915393, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.42183059453964233, + "learning_rate": 1.7761085534286405e-07, + "loss": -0.0027, + "num_tokens": 377038645.0, + "reward": 0.8496875047683716, + "reward_std": 0.08127287812530995, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8496875047683716, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24507201015949248, + "sampling/importance_sampling_ratio/max": 1.9258476972579956, + "sampling/importance_sampling_ratio/mean": 1.0000847578048706, + "sampling/importance_sampling_ratio/min": 0.3457289457321167, + "sampling/sampling_logp_difference/max": 1.0756362676620483, + "sampling/sampling_logp_difference/mean": 0.013472091406583786, + "step": 2790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1634.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 1164.784375, + "completions/mean_terminated_length": 1164.784375, + "completions/min_length": 835.6, + "completions/min_terminated_length": 835.6, + "entropy": 0.2675493985414505, + "epoch": 3.2843713278495885, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.43047988414764404, + "learning_rate": 1.7700508844196752e-07, + "loss": -0.001, + "num_tokens": 377747424.0, + "reward": 0.8713541746139526, + "reward_std": 0.10082083642482757, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8713541746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27632923126220704, + "sampling/importance_sampling_ratio/max": 1.967568302154541, + "sampling/importance_sampling_ratio/mean": 0.9999858021736145, + "sampling/importance_sampling_ratio/min": 0.3365649715065956, + "sampling/sampling_logp_difference/max": 1.2312506914138794, + "sampling/sampling_logp_difference/mean": 0.013835505396127701, + "step": 2795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1770.2, + "completions/max_terminated_length": 1714.6, + "completions/mean_length": 1174.909375, + "completions/mean_terminated_length": 1171.104638671875, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "entropy": 0.2529326885938644, + "epoch": 3.290246768507638, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.45852166414260864, + "learning_rate": 1.7639932154107098e-07, + "loss": -0.0045, + "num_tokens": 378429455.0, + "reward": 0.8065625309944153, + "reward_std": 0.12179329991340637, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8065625190734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25495838224887846, + "sampling/importance_sampling_ratio/max": 1.95216703414917, + "sampling/importance_sampling_ratio/mean": 0.9999139785766602, + "sampling/importance_sampling_ratio/min": 0.3560261070728302, + "sampling/sampling_logp_difference/max": 1.1213287830352783, + "sampling/sampling_logp_difference/mean": 0.013436216115951537, + "step": 2800 + }, + { + "epoch": 3.290246768507638, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1508.4, + "eval_completions/max_terminated_length": 1508.4, + "eval_completions/mean_length": 1131.9075, + "eval_completions/mean_terminated_length": 1131.9075, + "eval_completions/min_length": 855.72, + "eval_completions/min_terminated_length": 855.72, + "eval_entropy": 0.25337180495262146, + "eval_frac_reward_zero_std": 0.56, + "eval_loss": -0.00035420674248598516, + "eval_num_tokens": 378429455.0, + "eval_reward": 0.7613020920753479, + "eval_reward_std": 0.08451422370970249, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7613020932674408, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2949679693579674, + "eval_runtime": 419.5616, + "eval_samples_per_second": 0.238, + "eval_sampling/importance_sampling_ratio/max": 1.9767439079284668, + "eval_sampling/importance_sampling_ratio/mean": 1.0000990962982177, + "eval_sampling/importance_sampling_ratio/min": 0.31874189218506216, + "eval_sampling/sampling_logp_difference/max": 1.4742623805999755, + "eval_sampling/sampling_logp_difference/mean": 0.013241245038807392, + "eval_steps_per_second": 0.005, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1747.4, + "completions/max_terminated_length": 1747.4, + "completions/mean_length": 1235.028125, + "completions/mean_terminated_length": 1235.028125, + "completions/min_length": 959.8, + "completions/min_terminated_length": 959.8, + "entropy": 0.24830776453018188, + "epoch": 3.2961222091656874, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.3905949592590332, + "learning_rate": 1.7579355464017444e-07, + "loss": 0.0029, + "num_tokens": 379146824.0, + "reward": 0.8643229365348816, + "reward_std": 0.09152939319610595, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8643229365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23323871195316315, + "sampling/importance_sampling_ratio/max": 1.9724785804748535, + "sampling/importance_sampling_ratio/mean": 1.0000085353851318, + "sampling/importance_sampling_ratio/min": 0.34535167515277865, + "sampling/sampling_logp_difference/max": 1.126982283592224, + "sampling/sampling_logp_difference/mean": 0.012922433577477932, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1577.4, + "completions/max_terminated_length": 1577.4, + "completions/mean_length": 1131.065625, + "completions/mean_terminated_length": 1131.065625, + "completions/min_length": 853.4, + "completions/min_terminated_length": 853.4, + "entropy": 0.26412600874900816, + "epoch": 3.3019976498237367, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6829572916030884, + "learning_rate": 1.7518778773927793e-07, + "loss": -0.0053, + "num_tokens": 379835933.0, + "reward": 0.870312511920929, + "reward_std": 0.05251576006412506, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.870312511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22497628554701804, + "sampling/importance_sampling_ratio/max": 1.9401256084442138, + "sampling/importance_sampling_ratio/mean": 1.00009765625, + "sampling/importance_sampling_ratio/min": 0.3727611839771271, + "sampling/sampling_logp_difference/max": 1.0502751111984252, + "sampling/sampling_logp_difference/mean": 0.013848403468728065, + "step": 2810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1655.6, + "completions/max_terminated_length": 1655.6, + "completions/mean_length": 1195.8375, + "completions/mean_terminated_length": 1195.8375, + "completions/min_length": 827.6, + "completions/min_terminated_length": 827.6, + "entropy": 0.24685363173484803, + "epoch": 3.3078730904817863, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6114619970321655, + "learning_rate": 1.745820208383814e-07, + "loss": -0.0025, + "num_tokens": 380550489.0, + "reward": 0.8239583492279052, + "reward_std": 0.09283133745193481, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8239583492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28367829620838164, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999561309814453, + "sampling/importance_sampling_ratio/min": 0.31022571623325346, + "sampling/sampling_logp_difference/max": 1.3538453340530396, + "sampling/sampling_logp_difference/mean": 0.013016180135309696, + "step": 2815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1631.6, + "completions/max_terminated_length": 1631.6, + "completions/mean_length": 1201.109375, + "completions/mean_terminated_length": 1201.109375, + "completions/min_length": 895.0, + "completions/min_terminated_length": 895.0, + "entropy": 0.25497291088104246, + "epoch": 3.3137485311398356, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4095427393913269, + "learning_rate": 1.7397625393748486e-07, + "loss": 0.0055, + "num_tokens": 381267212.0, + "reward": 0.815625011920929, + "reward_std": 0.0877026379108429, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.815625011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2964575469493866, + "sampling/importance_sampling_ratio/max": 1.9889522314071655, + "sampling/importance_sampling_ratio/mean": 0.9999091506004334, + "sampling/importance_sampling_ratio/min": 0.3753842532634735, + "sampling/sampling_logp_difference/max": 1.032554531097412, + "sampling/sampling_logp_difference/mean": 0.013489954732358455, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.2, + "completions/max_terminated_length": 1559.2, + "completions/mean_length": 1157.134375, + "completions/mean_terminated_length": 1157.134375, + "completions/min_length": 814.6, + "completions/min_terminated_length": 814.6, + "entropy": 0.25930267572402954, + "epoch": 3.319623971797885, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6701661348342896, + "learning_rate": 1.7337048703658832e-07, + "loss": -0.0002, + "num_tokens": 381964775.0, + "reward": 0.8198958396911621, + "reward_std": 0.106954687833786, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8198958396911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2636798769235611, + "sampling/importance_sampling_ratio/max": 1.962524652481079, + "sampling/importance_sampling_ratio/mean": 1.0000115036964417, + "sampling/importance_sampling_ratio/min": 0.23018737211823465, + "sampling/sampling_logp_difference/max": 1.7914230823516846, + "sampling/sampling_logp_difference/mean": 0.013436510972678662, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1636.4, + "completions/max_terminated_length": 1636.4, + "completions/mean_length": 1192.090625, + "completions/mean_terminated_length": 1192.090625, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.259701132774353, + "epoch": 3.325499412455934, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5898331999778748, + "learning_rate": 1.7276472013569176e-07, + "loss": -0.0043, + "num_tokens": 382688948.0, + "reward": 0.7366666793823242, + "reward_std": 0.09564396217465401, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7366666793823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31994677186012266, + "sampling/importance_sampling_ratio/max": 1.8907454490661622, + "sampling/importance_sampling_ratio/mean": 0.9998792886734009, + "sampling/importance_sampling_ratio/min": 0.35978134274482726, + "sampling/sampling_logp_difference/max": 1.0968676328659057, + "sampling/sampling_logp_difference/mean": 0.013501750491559505, + "step": 2830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1637.4, + "completions/max_terminated_length": 1637.4, + "completions/mean_length": 1182.41875, + "completions/mean_terminated_length": 1182.41875, + "completions/min_length": 785.6, + "completions/min_terminated_length": 785.6, + "entropy": 0.23543883562088014, + "epoch": 3.3313748531139837, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5775417685508728, + "learning_rate": 1.7215895323479524e-07, + "loss": 0.0014, + "num_tokens": 383374234.0, + "reward": 0.8911458492279053, + "reward_std": 0.11471164971590042, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8911458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20149447470903398, + "sampling/importance_sampling_ratio/max": 1.8971025705337525, + "sampling/importance_sampling_ratio/mean": 1.0000097513198853, + "sampling/importance_sampling_ratio/min": 0.38829224109649657, + "sampling/sampling_logp_difference/max": 1.031450629234314, + "sampling/sampling_logp_difference/mean": 0.012517093122005463, + "step": 2835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1589.0, + "completions/max_terminated_length": 1589.0, + "completions/mean_length": 1181.99375, + "completions/mean_terminated_length": 1181.99375, + "completions/min_length": 906.2, + "completions/min_terminated_length": 906.2, + "entropy": 0.26151891946792605, + "epoch": 3.337250293772033, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4710766673088074, + "learning_rate": 1.715531863338987e-07, + "loss": -0.002, + "num_tokens": 384075800.0, + "reward": 0.7330729365348816, + "reward_std": 0.09073313027620315, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7330729365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3040100812911987, + "sampling/importance_sampling_ratio/max": 1.8934704780578613, + "sampling/importance_sampling_ratio/mean": 1.000049901008606, + "sampling/importance_sampling_ratio/min": 0.3532286584377289, + "sampling/sampling_logp_difference/max": 1.0922445774078369, + "sampling/sampling_logp_difference/mean": 0.013711506128311157, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1627.6, + "completions/max_terminated_length": 1627.6, + "completions/mean_length": 1227.815625, + "completions/mean_terminated_length": 1227.815625, + "completions/min_length": 944.4, + "completions/min_terminated_length": 944.4, + "entropy": 0.2659549415111542, + "epoch": 3.343125734430082, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 1.7094741943300217e-07, + "loss": 0.0042, + "num_tokens": 384780157.0, + "reward": 0.8932291746139527, + "reward_std": 0.06313644722104073, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8932291746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19119044840335847, + "sampling/importance_sampling_ratio/max": 1.9835964679718017, + "sampling/importance_sampling_ratio/mean": 1.000018262863159, + "sampling/importance_sampling_ratio/min": 0.35918577909469607, + "sampling/sampling_logp_difference/max": 1.0723339557647704, + "sampling/sampling_logp_difference/mean": 0.013455265015363694, + "step": 2845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1667.8, + "completions/max_terminated_length": 1667.8, + "completions/mean_length": 1186.8, + "completions/mean_terminated_length": 1186.8, + "completions/min_length": 892.2, + "completions/min_terminated_length": 892.2, + "entropy": 0.26246914863586424, + "epoch": 3.3490011750881314, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5709245204925537, + "learning_rate": 1.7034165253210563e-07, + "loss": -0.0035, + "num_tokens": 385461997.0, + "reward": 0.8755208373069763, + "reward_std": 0.09409238025546074, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8755208373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2078237384557724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000224113464355, + "sampling/importance_sampling_ratio/min": 0.29794262945652006, + "sampling/sampling_logp_difference/max": 1.3214488983154298, + "sampling/sampling_logp_difference/mean": 0.013476687669754028, + "step": 2850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1621.0, + "completions/max_terminated_length": 1621.0, + "completions/mean_length": 1135.009375, + "completions/mean_terminated_length": 1135.009375, + "completions/min_length": 795.6, + "completions/min_terminated_length": 795.6, + "entropy": 0.2704127460718155, + "epoch": 3.354876615746181, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6261325478553772, + "learning_rate": 1.697358856312091e-07, + "loss": 0.0004, + "num_tokens": 386161392.0, + "reward": 0.8057291984558106, + "reward_std": 0.04726305603981018, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8057291984558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2478840470314026, + "sampling/importance_sampling_ratio/max": 1.988915705680847, + "sampling/importance_sampling_ratio/mean": 1.000031304359436, + "sampling/importance_sampling_ratio/min": 0.363059838116169, + "sampling/sampling_logp_difference/max": 1.249585008621216, + "sampling/sampling_logp_difference/mean": 0.013962916098535061, + "step": 2855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1632.8, + "completions/max_terminated_length": 1632.8, + "completions/mean_length": 1203.0375, + "completions/mean_terminated_length": 1203.0375, + "completions/min_length": 960.2, + "completions/min_terminated_length": 960.2, + "entropy": 0.26425559520721437, + "epoch": 3.3607520564042304, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.3815658986568451, + "learning_rate": 1.6913011873031259e-07, + "loss": -0.0073, + "num_tokens": 386859852.0, + "reward": 0.8492187619209289, + "reward_std": 0.07277037352323532, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8492187619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2327181279659271, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999239444732666, + "sampling/importance_sampling_ratio/min": 0.2944484859704971, + "sampling/sampling_logp_difference/max": 1.2413968324661255, + "sampling/sampling_logp_difference/mean": 0.013403261639177799, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1474.4, + "completions/max_terminated_length": 1474.4, + "completions/mean_length": 1176.075, + "completions/mean_terminated_length": 1176.075, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.2643334299325943, + "epoch": 3.3666274970622796, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6973140835762024, + "learning_rate": 1.6852435182941605e-07, + "loss": 0.0031, + "num_tokens": 387544932.0, + "reward": 0.746875, + "reward_std": 0.0624865785241127, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.746875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2990709781646729, + "sampling/importance_sampling_ratio/max": 1.9863412141799928, + "sampling/importance_sampling_ratio/mean": 1.000007724761963, + "sampling/importance_sampling_ratio/min": 0.39941216707229615, + "sampling/sampling_logp_difference/max": 0.9842625856399536, + "sampling/sampling_logp_difference/mean": 0.013473628833889962, + "step": 2865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 1162.63125, + "completions/mean_terminated_length": 1162.63125, + "completions/min_length": 848.6, + "completions/min_terminated_length": 848.6, + "entropy": 0.26365244686603545, + "epoch": 3.372502937720329, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6711568236351013, + "learning_rate": 1.6791858492851948e-07, + "loss": -0.003, + "num_tokens": 388234830.0, + "reward": 0.8809895992279053, + "reward_std": 0.053984729945659636, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8809895992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16809427738189697, + "sampling/importance_sampling_ratio/max": 1.95412278175354, + "sampling/importance_sampling_ratio/mean": 1.0000274538993836, + "sampling/importance_sampling_ratio/min": 0.40029398798942567, + "sampling/sampling_logp_difference/max": 0.9596251726150513, + "sampling/sampling_logp_difference/mean": 0.01345615666359663, + "step": 2870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1530.4, + "completions/max_terminated_length": 1530.4, + "completions/mean_length": 1171.425, + "completions/mean_terminated_length": 1171.425, + "completions/min_length": 896.2, + "completions/min_terminated_length": 896.2, + "entropy": 0.25766043066978456, + "epoch": 3.3783783783783785, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5919622182846069, + "learning_rate": 1.6731281802762295e-07, + "loss": 0.0019, + "num_tokens": 388903718.0, + "reward": 0.9229166865348816, + "reward_std": 0.06342011243104935, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9229166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16442330777645112, + "sampling/importance_sampling_ratio/max": 1.9627609968185424, + "sampling/importance_sampling_ratio/mean": 1.000002133846283, + "sampling/importance_sampling_ratio/min": 0.29471515119075775, + "sampling/sampling_logp_difference/max": 1.2630483984947205, + "sampling/sampling_logp_difference/mean": 0.013212603889405728, + "step": 2875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1785.8, + "completions/max_terminated_length": 1785.8, + "completions/mean_length": 1277.615625, + "completions/mean_terminated_length": 1277.615625, + "completions/min_length": 892.4, + "completions/min_terminated_length": 892.4, + "entropy": 0.2642282694578171, + "epoch": 3.3842538190364277, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 1.667070511267264e-07, + "loss": 0.0003, + "num_tokens": 389659083.0, + "reward": 0.8364583492279053, + "reward_std": 0.07245562374591827, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8364583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22363150119781494, + "sampling/importance_sampling_ratio/max": 1.9598698139190673, + "sampling/importance_sampling_ratio/mean": 0.99994957447052, + "sampling/importance_sampling_ratio/min": 0.3109358698129654, + "sampling/sampling_logp_difference/max": 1.2494572162628175, + "sampling/sampling_logp_difference/mean": 0.01340102069079876, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1564.4, + "completions/max_terminated_length": 1564.4, + "completions/mean_length": 1184.853125, + "completions/mean_terminated_length": 1184.853125, + "completions/min_length": 883.6, + "completions/min_terminated_length": 883.6, + "entropy": 0.2694912374019623, + "epoch": 3.390129259694477, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.41767993569374084, + "learning_rate": 1.661012842258299e-07, + "loss": -0.0038, + "num_tokens": 390364700.0, + "reward": 0.8158854365348815, + "reward_std": 0.06047187112271786, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8158854365348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29825079143047334, + "sampling/importance_sampling_ratio/max": 1.983485460281372, + "sampling/importance_sampling_ratio/mean": 1.000064730644226, + "sampling/importance_sampling_ratio/min": 0.3307294547557831, + "sampling/sampling_logp_difference/max": 1.4649325132369995, + "sampling/sampling_logp_difference/mean": 0.01370826605707407, + "step": 2885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1892.6, + "completions/max_terminated_length": 1892.6, + "completions/mean_length": 1323.515625, + "completions/mean_terminated_length": 1323.515625, + "completions/min_length": 993.6, + "completions/min_terminated_length": 993.6, + "entropy": 0.28171729743480683, + "epoch": 3.3960047003525267, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5330216884613037, + "learning_rate": 1.6549551732493336e-07, + "loss": -0.0033, + "num_tokens": 391118577.0, + "reward": 0.7697916746139526, + "reward_std": 0.0524015374481678, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7697916746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27035410702228546, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999969756603241, + "sampling/importance_sampling_ratio/min": 0.24186867326498032, + "sampling/sampling_logp_difference/max": 1.6836973428726196, + "sampling/sampling_logp_difference/mean": 0.013766255043447018, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1748.2, + "completions/max_terminated_length": 1748.2, + "completions/mean_length": 1213.021875, + "completions/mean_terminated_length": 1213.021875, + "completions/min_length": 896.6, + "completions/min_terminated_length": 896.6, + "entropy": 0.29188904762268064, + "epoch": 3.401880141010576, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6556338667869568, + "learning_rate": 1.6488975042403683e-07, + "loss": -0.0004, + "num_tokens": 391879512.0, + "reward": 0.8236458420753479, + "reward_std": 0.09013433307409287, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8236458420753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26258903443813325, + "sampling/importance_sampling_ratio/max": 1.9666601896286011, + "sampling/importance_sampling_ratio/mean": 0.9999792337417602, + "sampling/importance_sampling_ratio/min": 0.34474579691886903, + "sampling/sampling_logp_difference/max": 1.1234654664993287, + "sampling/sampling_logp_difference/mean": 0.014820769429206848, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1670.0, + "completions/max_terminated_length": 1670.0, + "completions/mean_length": 1213.39375, + "completions/mean_terminated_length": 1213.39375, + "completions/min_length": 897.6, + "completions/min_terminated_length": 897.6, + "entropy": 0.28398237824440004, + "epoch": 3.407755581668625, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.0, + "learning_rate": 1.642839835231403e-07, + "loss": -0.0024, + "num_tokens": 392588134.0, + "reward": 0.8369791746139527, + "reward_std": 0.045029500126838685, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8369791746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28497442603111267, + "sampling/importance_sampling_ratio/max": 1.9755486488342284, + "sampling/importance_sampling_ratio/mean": 0.9999412894248962, + "sampling/importance_sampling_ratio/min": 0.37593441605567934, + "sampling/sampling_logp_difference/max": 1.0409663915634155, + "sampling/sampling_logp_difference/mean": 0.01409566793590784, + "step": 2900 + }, + { + "epoch": 3.407755581668625, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1682.08, + "eval_completions/max_terminated_length": 1682.08, + "eval_completions/mean_length": 1202.545, + "eval_completions/mean_terminated_length": 1202.545, + "eval_completions/min_length": 904.52, + "eval_completions/min_terminated_length": 904.52, + "eval_entropy": 0.2751586544513702, + "eval_frac_reward_zero_std": 0.64, + "eval_loss": 0.004005698952823877, + "eval_num_tokens": 392588134.0, + "eval_reward": 0.7671562600135803, + "eval_reward_std": 0.07342575185000896, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7671562600135803, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2901870983839035, + "eval_runtime": 462.7739, + "eval_samples_per_second": 0.216, + "eval_sampling/importance_sampling_ratio/max": 1.9416738891601562, + "eval_sampling/importance_sampling_ratio/mean": 0.9999853873252869, + "eval_sampling/importance_sampling_ratio/min": 0.3018297159723193, + "eval_sampling/sampling_logp_difference/max": 2.2107747268676756, + "eval_sampling/sampling_logp_difference/mean": 0.013962351121008396, + "eval_steps_per_second": 0.004, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.6, + "completions/max_terminated_length": 1622.6, + "completions/mean_length": 1219.08125, + "completions/mean_terminated_length": 1219.08125, + "completions/min_length": 910.8, + "completions/min_terminated_length": 910.8, + "entropy": 0.27651492953300477, + "epoch": 3.4136310223266744, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.523073673248291, + "learning_rate": 1.6367821662224375e-07, + "loss": -0.0013, + "num_tokens": 393292144.0, + "reward": 0.8482812643051147, + "reward_std": 0.04793854169547558, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8482812643051147, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28270514160394666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999916076660156, + "sampling/importance_sampling_ratio/min": 0.2504092216491699, + "sampling/sampling_logp_difference/max": 1.4600980758666993, + "sampling/sampling_logp_difference/mean": 0.014081763848662377, + "step": 2905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1609.6, + "completions/max_terminated_length": 1609.6, + "completions/mean_length": 1176.940625, + "completions/mean_terminated_length": 1176.940625, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "entropy": 0.2731807053089142, + "epoch": 3.4195064629847236, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.43564677238464355, + "learning_rate": 1.6307244972134721e-07, + "loss": -0.0022, + "num_tokens": 393994445.0, + "reward": 0.8734375, + "reward_std": 0.08320709615945816, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8734375, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20147269070148469, + "sampling/importance_sampling_ratio/max": 1.970658278465271, + "sampling/importance_sampling_ratio/mean": 1.0000769376754761, + "sampling/importance_sampling_ratio/min": 0.4103114724159241, + "sampling/sampling_logp_difference/max": 0.9029069662094116, + "sampling/sampling_logp_difference/mean": 0.013669577986001968, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1716.2, + "completions/max_terminated_length": 1716.2, + "completions/mean_length": 1238.3, + "completions/mean_terminated_length": 1238.3, + "completions/min_length": 924.8, + "completions/min_terminated_length": 924.8, + "entropy": 0.27329388856887815, + "epoch": 3.4253819036427733, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6953060030937195, + "learning_rate": 1.6246668282045068e-07, + "loss": -0.0019, + "num_tokens": 394732077.0, + "reward": 0.9040104269981384, + "reward_std": 0.042052581906318665, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9040104269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17316112592816352, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999583959579468, + "sampling/importance_sampling_ratio/min": 0.33822412192821505, + "sampling/sampling_logp_difference/max": 1.1926729202270507, + "sampling/sampling_logp_difference/mean": 0.013831990212202072, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1761.4, + "completions/max_terminated_length": 1761.4, + "completions/mean_length": 1285.715625, + "completions/mean_terminated_length": 1285.715625, + "completions/min_length": 906.4, + "completions/min_terminated_length": 906.4, + "entropy": 0.2749119311571121, + "epoch": 3.4312573443008225, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4019206762313843, + "learning_rate": 1.6186091591955414e-07, + "loss": -0.0004, + "num_tokens": 395468258.0, + "reward": 0.8369791746139527, + "reward_std": 0.10039222538471222, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8369791746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24194913506507873, + "sampling/importance_sampling_ratio/max": 1.9278610944747925, + "sampling/importance_sampling_ratio/mean": 1.000080668926239, + "sampling/importance_sampling_ratio/min": 0.30234331358224154, + "sampling/sampling_logp_difference/max": 1.7760730504989624, + "sampling/sampling_logp_difference/mean": 0.013776698522269726, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.4, + "completions/max_terminated_length": 1701.4, + "completions/mean_length": 1198.328125, + "completions/mean_terminated_length": 1198.328125, + "completions/min_length": 898.6, + "completions/min_terminated_length": 898.6, + "entropy": 0.25736156702041624, + "epoch": 3.4371327849588718, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6544657945632935, + "learning_rate": 1.612551490186576e-07, + "loss": 0.0055, + "num_tokens": 396206859.0, + "reward": 0.8437500119209289, + "reward_std": 0.08604325987398624, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8437500119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2119861736893654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999913215637207, + "sampling/importance_sampling_ratio/min": 0.27694354951381683, + "sampling/sampling_logp_difference/max": 1.3848939895629884, + "sampling/sampling_logp_difference/mean": 0.013181288540363312, + "step": 2925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1950.6, + "completions/max_terminated_length": 1864.6, + "completions/mean_length": 1320.728125, + "completions/mean_terminated_length": 1317.262109375, + "completions/min_length": 933.6, + "completions/min_terminated_length": 933.6, + "entropy": 0.2831844985485077, + "epoch": 3.4430082256169214, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4558798670768738, + "learning_rate": 1.6064938211776107e-07, + "loss": -0.0071, + "num_tokens": 396956608.0, + "reward": 0.789843761920929, + "reward_std": 0.09386796951293945, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.789843761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28504001498222353, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999266624450683, + "sampling/importance_sampling_ratio/min": 0.26929228343069556, + "sampling/sampling_logp_difference/max": 1.7527684926986695, + "sampling/sampling_logp_difference/mean": 0.01404192615300417, + "step": 2930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1707.6, + "completions/max_terminated_length": 1707.6, + "completions/mean_length": 1202.7375, + "completions/mean_terminated_length": 1202.7375, + "completions/min_length": 831.8, + "completions/min_terminated_length": 831.8, + "entropy": 0.2845179855823517, + "epoch": 3.4488836662749707, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6161671280860901, + "learning_rate": 1.6004361521686456e-07, + "loss": -0.0046, + "num_tokens": 397661164.0, + "reward": 0.8372395992279053, + "reward_std": 0.08098205551505089, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8372396051883697, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23883938044309616, + "sampling/importance_sampling_ratio/max": 1.922814679145813, + "sampling/importance_sampling_ratio/mean": 1.0000414729118348, + "sampling/importance_sampling_ratio/min": 0.34642059803009034, + "sampling/sampling_logp_difference/max": 1.499940037727356, + "sampling/sampling_logp_difference/mean": 0.014074294827878475, + "step": 2935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1735.0, + "completions/max_terminated_length": 1735.0, + "completions/mean_length": 1231.615625, + "completions/mean_terminated_length": 1231.615625, + "completions/min_length": 959.6, + "completions/min_terminated_length": 959.6, + "entropy": 0.26078474819660186, + "epoch": 3.45475910693302, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.43476438522338867, + "learning_rate": 1.5943784831596802e-07, + "loss": 0.003, + "num_tokens": 398367009.0, + "reward": 0.8515625119209289, + "reward_std": 0.08800038211047649, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8515625119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.240531849861145, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999879479408265, + "sampling/importance_sampling_ratio/min": 0.38882568180561067, + "sampling/sampling_logp_difference/max": 1.0931127309799193, + "sampling/sampling_logp_difference/mean": 0.013227501325309277, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 1271.440625, + "completions/mean_terminated_length": 1271.440625, + "completions/min_length": 914.0, + "completions/min_terminated_length": 914.0, + "entropy": 0.28265722990036013, + "epoch": 3.460634547591069, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.43009883165359497, + "learning_rate": 1.5883208141507148e-07, + "loss": 0.0029, + "num_tokens": 399059678.0, + "reward": 0.8828125119209289, + "reward_std": 0.04852218851447106, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8828125119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15546741783618928, + "sampling/importance_sampling_ratio/max": 1.891363549232483, + "sampling/importance_sampling_ratio/mean": 0.9999841690063477, + "sampling/importance_sampling_ratio/min": 0.4459798693656921, + "sampling/sampling_logp_difference/max": 0.8462659478187561, + "sampling/sampling_logp_difference/mean": 0.014082801155745983, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1823.4, + "completions/max_terminated_length": 1823.4, + "completions/mean_length": 1283.26875, + "completions/mean_terminated_length": 1283.26875, + "completions/min_length": 919.8, + "completions/min_terminated_length": 919.8, + "entropy": 0.28141863346099855, + "epoch": 3.466509988249119, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6030286550521851, + "learning_rate": 1.5822631451417492e-07, + "loss": 0.0095, + "num_tokens": 399779444.0, + "reward": 0.7938541889190673, + "reward_std": 0.10225731804966927, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7938541889190673, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25199161395430564, + "sampling/importance_sampling_ratio/max": 1.995455813407898, + "sampling/importance_sampling_ratio/mean": 1.0000591158866883, + "sampling/importance_sampling_ratio/min": 0.35674205124378205, + "sampling/sampling_logp_difference/max": 1.0800941228866576, + "sampling/sampling_logp_difference/mean": 0.013914234191179275, + "step": 2950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.4, + "completions/max_terminated_length": 1739.4, + "completions/mean_length": 1261.48125, + "completions/mean_terminated_length": 1261.48125, + "completions/min_length": 909.8, + "completions/min_terminated_length": 909.8, + "entropy": 0.2674350649118423, + "epoch": 3.472385428907168, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 1.5762054761327838e-07, + "loss": 0.0024, + "num_tokens": 400484766.0, + "reward": 0.9380208492279053, + "reward_std": 0.04721375107765198, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9380208492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1494040608406067, + "sampling/importance_sampling_ratio/max": 1.9453770637512207, + "sampling/importance_sampling_ratio/mean": 1.0000848650932312, + "sampling/importance_sampling_ratio/min": 0.26389922499656676, + "sampling/sampling_logp_difference/max": 1.8547302961349488, + "sampling/sampling_logp_difference/mean": 0.013454168103635311, + "step": 2955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1712.4, + "completions/max_terminated_length": 1712.4, + "completions/mean_length": 1213.08125, + "completions/mean_terminated_length": 1213.08125, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.25921439528465273, + "epoch": 3.4782608695652173, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.7261385917663574, + "learning_rate": 1.5701478071238187e-07, + "loss": 0.0014, + "num_tokens": 401193560.0, + "reward": 0.9666666746139526, + "reward_std": 0.04748408943414688, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9666666746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.08946052193641663, + "sampling/importance_sampling_ratio/max": 1.9423861503601074, + "sampling/importance_sampling_ratio/mean": 1.000016164779663, + "sampling/importance_sampling_ratio/min": 0.3426270544528961, + "sampling/sampling_logp_difference/max": 1.0901034355163575, + "sampling/sampling_logp_difference/mean": 0.013289996609091758, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1657.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 1260.8875, + "completions/mean_terminated_length": 1260.8875, + "completions/min_length": 949.2, + "completions/min_terminated_length": 949.2, + "entropy": 0.28407627046108247, + "epoch": 3.484136310223267, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.5941851139068604, + "learning_rate": 1.5640901381148533e-07, + "loss": 0.0043, + "num_tokens": 401933636.0, + "reward": 0.93125, + "reward_std": 0.03618033975362778, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.93125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13101719617843627, + "sampling/importance_sampling_ratio/max": 1.919576144218445, + "sampling/importance_sampling_ratio/mean": 0.9998687863349914, + "sampling/importance_sampling_ratio/min": 0.36393317878246306, + "sampling/sampling_logp_difference/max": 1.128061878681183, + "sampling/sampling_logp_difference/mean": 0.014223051071166993, + "step": 2965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1554.4, + "completions/max_terminated_length": 1554.4, + "completions/mean_length": 1201.296875, + "completions/mean_terminated_length": 1201.296875, + "completions/min_length": 940.8, + "completions/min_terminated_length": 940.8, + "entropy": 0.2826466590166092, + "epoch": 3.490011750881316, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.749692976474762, + "learning_rate": 1.558032469105888e-07, + "loss": -0.0038, + "num_tokens": 402682451.0, + "reward": 0.8328125, + "reward_std": 0.06184412464499474, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8328125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2695111095905304, + "sampling/importance_sampling_ratio/max": 1.8911922693252563, + "sampling/importance_sampling_ratio/mean": 0.9998687863349914, + "sampling/importance_sampling_ratio/min": 0.31807301938533783, + "sampling/sampling_logp_difference/max": 1.2873031139373778, + "sampling/sampling_logp_difference/mean": 0.014129643328487873, + "step": 2970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1598.6, + "completions/max_terminated_length": 1598.6, + "completions/mean_length": 1208.403125, + "completions/mean_terminated_length": 1208.403125, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.2606518566608429, + "epoch": 3.4958871915393654, + "frac_reward_zero_std": 0.6, + "grad_norm": 2.265922784805298, + "learning_rate": 1.5519748000969226e-07, + "loss": 0.0057, + "num_tokens": 403390836.0, + "reward": 0.9125000238418579, + "reward_std": 0.08965013474225998, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9125000238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15533160120248796, + "sampling/importance_sampling_ratio/max": 1.9447248697280883, + "sampling/importance_sampling_ratio/mean": 1.0000758647918702, + "sampling/importance_sampling_ratio/min": 0.31950002945959566, + "sampling/sampling_logp_difference/max": 1.5232036113739014, + "sampling/sampling_logp_difference/mean": 0.013233417831361293, + "step": 2975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1818.2, + "completions/max_terminated_length": 1818.2, + "completions/mean_length": 1310.58125, + "completions/mean_terminated_length": 1310.58125, + "completions/min_length": 1002.6, + "completions/min_terminated_length": 1002.6, + "entropy": 0.2939403593540192, + "epoch": 3.5017626321974147, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6451891660690308, + "learning_rate": 1.5459171310879575e-07, + "loss": 0.0034, + "num_tokens": 404117470.0, + "reward": 0.8412500023841858, + "reward_std": 0.09634547531604767, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8412500023841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27206834256649015, + "sampling/importance_sampling_ratio/max": 1.98237042427063, + "sampling/importance_sampling_ratio/mean": 1.0001010060310365, + "sampling/importance_sampling_ratio/min": 0.3473032474517822, + "sampling/sampling_logp_difference/max": 1.2496708631515503, + "sampling/sampling_logp_difference/mean": 0.01439862884581089, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1741.4, + "completions/max_terminated_length": 1741.4, + "completions/mean_length": 1230.04375, + "completions/mean_terminated_length": 1230.04375, + "completions/min_length": 934.8, + "completions/min_terminated_length": 934.8, + "entropy": 0.2777364790439606, + "epoch": 3.507638072855464, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.47499480843544006, + "learning_rate": 1.539859462078992e-07, + "loss": -0.0007, + "num_tokens": 404824988.0, + "reward": 0.8776041746139527, + "reward_std": 0.1006715402007103, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8776041746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20324081927537918, + "sampling/importance_sampling_ratio/max": 1.9307854890823364, + "sampling/importance_sampling_ratio/mean": 1.0000447750091552, + "sampling/importance_sampling_ratio/min": 0.305920846760273, + "sampling/sampling_logp_difference/max": 1.9527714014053346, + "sampling/sampling_logp_difference/mean": 0.013740032538771629, + "step": 2985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1687.8, + "completions/max_terminated_length": 1687.8, + "completions/mean_length": 1252.390625, + "completions/mean_terminated_length": 1252.390625, + "completions/min_length": 951.0, + "completions/min_terminated_length": 951.0, + "entropy": 0.27992117404937744, + "epoch": 3.5135135135135136, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 1.5338017930700267e-07, + "loss": 0.0011, + "num_tokens": 405562777.0, + "reward": 0.7064583420753479, + "reward_std": 0.07313631623983383, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7064583420753479, + "rewards/e2e_recall_precision_mixed_reward/std": 0.37596028447151186, + "sampling/importance_sampling_ratio/max": 1.9501430988311768, + "sampling/importance_sampling_ratio/mean": 0.9999474048614502, + "sampling/importance_sampling_ratio/min": 0.392292720079422, + "sampling/sampling_logp_difference/max": 1.0178369522094726, + "sampling/sampling_logp_difference/mean": 0.014073196426033973, + "step": 2990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.4, + "completions/max_terminated_length": 1640.4, + "completions/mean_length": 1195.1, + "completions/mean_terminated_length": 1195.1, + "completions/min_length": 846.8, + "completions/min_terminated_length": 846.8, + "entropy": 0.2721045553684235, + "epoch": 3.519388954171563, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.31242266297340393, + "learning_rate": 1.527744124061061e-07, + "loss": -0.0014, + "num_tokens": 406274617.0, + "reward": 0.8744791746139526, + "reward_std": 0.06375965140759945, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8744791746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21375257074832915, + "sampling/importance_sampling_ratio/max": 1.9984694957733153, + "sampling/importance_sampling_ratio/mean": 1.0000406622886657, + "sampling/importance_sampling_ratio/min": 0.3253972053527832, + "sampling/sampling_logp_difference/max": 1.2722053050994873, + "sampling/sampling_logp_difference/mean": 0.013604764640331269, + "step": 2995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1595.6, + "completions/max_terminated_length": 1595.6, + "completions/mean_length": 1300.559375, + "completions/mean_terminated_length": 1300.559375, + "completions/min_length": 1036.8, + "completions/min_terminated_length": 1036.8, + "entropy": 0.2830276906490326, + "epoch": 3.525264394829612, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.33567795157432556, + "learning_rate": 1.5216864550520957e-07, + "loss": -0.0012, + "num_tokens": 406998252.0, + "reward": 0.91015625, + "reward_std": 0.08269033730030059, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.91015625, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18473040610551833, + "sampling/importance_sampling_ratio/max": 1.908703327178955, + "sampling/importance_sampling_ratio/mean": 1.000068771839142, + "sampling/importance_sampling_ratio/min": 0.4005335092544556, + "sampling/sampling_logp_difference/max": 0.9637721180915833, + "sampling/sampling_logp_difference/mean": 0.013768712431192398, + "step": 3000 + }, + { + "epoch": 3.525264394829612, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1676.92, + "eval_completions/max_terminated_length": 1676.92, + "eval_completions/mean_length": 1212.78375, + "eval_completions/mean_terminated_length": 1212.78375, + "eval_completions/min_length": 909.88, + "eval_completions/min_terminated_length": 909.88, + "eval_entropy": 0.2812558990716934, + "eval_frac_reward_zero_std": 0.6, + "eval_loss": 0.004141798242926598, + "eval_num_tokens": 406998252.0, + "eval_reward": 0.7675416767597198, + "eval_reward_std": 0.07886279493570328, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7675416767597198, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.297162811756134, + "eval_runtime": 460.2742, + "eval_samples_per_second": 0.217, + "eval_sampling/importance_sampling_ratio/max": 1.9591137409210204, + "eval_sampling/importance_sampling_ratio/mean": 1.0000136399269104, + "eval_sampling/importance_sampling_ratio/min": 0.34440110325813295, + "eval_sampling/sampling_logp_difference/max": 1.1788950943946839, + "eval_sampling/sampling_logp_difference/mean": 0.013986198827624322, + "eval_steps_per_second": 0.004, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1866.8, + "completions/max_terminated_length": 1866.8, + "completions/mean_length": 1259.075, + "completions/mean_terminated_length": 1259.075, + "completions/min_length": 962.8, + "completions/min_terminated_length": 962.8, + "entropy": 0.2702631801366806, + "epoch": 3.5311398354876617, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5860521793365479, + "learning_rate": 1.5156287860431306e-07, + "loss": 0.0013, + "num_tokens": 407717204.0, + "reward": 0.8221354365348816, + "reward_std": 0.10383418351411819, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8221354365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27810312807559967, + "sampling/importance_sampling_ratio/max": 1.9322570323944093, + "sampling/importance_sampling_ratio/mean": 1.000062108039856, + "sampling/importance_sampling_ratio/min": 0.40129616260528567, + "sampling/sampling_logp_difference/max": 1.0225771427154542, + "sampling/sampling_logp_difference/mean": 0.013545482978224754, + "step": 3005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1716.8, + "completions/max_terminated_length": 1716.8, + "completions/mean_length": 1246.384375, + "completions/mean_terminated_length": 1246.384375, + "completions/min_length": 847.8, + "completions/min_terminated_length": 847.8, + "entropy": 0.2856502890586853, + "epoch": 3.537015276145711, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5813427567481995, + "learning_rate": 1.5095711170341652e-07, + "loss": 0.0047, + "num_tokens": 408432751.0, + "reward": 0.8304687738418579, + "reward_std": 0.11719481796026229, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8304687738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27664896547794343, + "sampling/importance_sampling_ratio/max": 1.9686694860458374, + "sampling/importance_sampling_ratio/mean": 1.0000694751739503, + "sampling/importance_sampling_ratio/min": 0.31002587229013445, + "sampling/sampling_logp_difference/max": 1.3398897409439088, + "sampling/sampling_logp_difference/mean": 0.014048396609723568, + "step": 3010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1645.0, + "completions/max_terminated_length": 1645.0, + "completions/mean_length": 1196.290625, + "completions/mean_terminated_length": 1196.290625, + "completions/min_length": 942.4, + "completions/min_terminated_length": 942.4, + "entropy": 0.25909319818019866, + "epoch": 3.54289071680376, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 1.5035134480252e-07, + "loss": -0.0012, + "num_tokens": 409130076.0, + "reward": 0.9203125238418579, + "reward_std": 0.04663856625556946, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9203125238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13417750522494315, + "sampling/importance_sampling_ratio/max": 1.9461827516555785, + "sampling/importance_sampling_ratio/mean": 0.9999265551567078, + "sampling/importance_sampling_ratio/min": 0.4522597312927246, + "sampling/sampling_logp_difference/max": 0.8987068176269531, + "sampling/sampling_logp_difference/mean": 0.012898856587707997, + "step": 3015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1866.8, + "completions/max_terminated_length": 1866.8, + "completions/mean_length": 1327.4375, + "completions/mean_terminated_length": 1327.4375, + "completions/min_length": 1032.6, + "completions/min_terminated_length": 1032.6, + "entropy": 0.2696498155593872, + "epoch": 3.5487661574618095, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.5800609588623047, + "learning_rate": 1.4974557790162345e-07, + "loss": 0.0039, + "num_tokens": 409860904.0, + "reward": 0.8716145992279053, + "reward_std": 0.10067193508148194, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8716145992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19466981887817383, + "sampling/importance_sampling_ratio/max": 1.9492676258087158, + "sampling/importance_sampling_ratio/mean": 1.0000947833061218, + "sampling/importance_sampling_ratio/min": 0.3709883391857147, + "sampling/sampling_logp_difference/max": 1.1085660457611084, + "sampling/sampling_logp_difference/mean": 0.013166314736008645, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1770.6, + "completions/max_terminated_length": 1770.6, + "completions/mean_length": 1211.534375, + "completions/mean_terminated_length": 1211.534375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.2818950593471527, + "epoch": 3.554641598119859, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.4593271315097809, + "learning_rate": 1.491398110007269e-07, + "loss": -0.0063, + "num_tokens": 410592131.0, + "reward": 0.8971354365348816, + "reward_std": 0.09658310562372208, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8971354365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19376842826604843, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000097036361695, + "sampling/importance_sampling_ratio/min": 0.3906724154949188, + "sampling/sampling_logp_difference/max": 0.9618961811065674, + "sampling/sampling_logp_difference/mean": 0.014005866460502147, + "step": 3025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1513.6, + "completions/max_terminated_length": 1513.6, + "completions/mean_length": 1179.8375, + "completions/mean_terminated_length": 1179.8375, + "completions/min_length": 872.2, + "completions/min_terminated_length": 872.2, + "entropy": 0.25830590128898623, + "epoch": 3.5605170387779084, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4542413353919983, + "learning_rate": 1.485340440998304e-07, + "loss": -0.0012, + "num_tokens": 411270911.0, + "reward": 0.8885416746139526, + "reward_std": 0.07167095690965652, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8885416746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17076411694288254, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001612901687622, + "sampling/importance_sampling_ratio/min": 0.38393630981445315, + "sampling/sampling_logp_difference/max": 1.1111455321311952, + "sampling/sampling_logp_difference/mean": 0.013075116835534573, + "step": 3030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1862.6, + "completions/max_terminated_length": 1852.8, + "completions/mean_length": 1280.84375, + "completions/mean_terminated_length": 1263.3064208984374, + "completions/min_length": 963.8, + "completions/min_terminated_length": 963.8, + "entropy": 0.28606864213943484, + "epoch": 3.5663924794359576, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7895479798316956, + "learning_rate": 1.4792827719893384e-07, + "loss": -0.0009, + "num_tokens": 411973449.0, + "reward": 0.9044791698455811, + "reward_std": 0.10300752446055413, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9044791698455811, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18286249935626983, + "sampling/importance_sampling_ratio/max": 1.9817769050598144, + "sampling/importance_sampling_ratio/mean": 1.0001332998275756, + "sampling/importance_sampling_ratio/min": 0.44017385244369506, + "sampling/sampling_logp_difference/max": 0.8357086300849914, + "sampling/sampling_logp_difference/mean": 0.013966779969632626, + "step": 3035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1691.8, + "completions/max_terminated_length": 1691.8, + "completions/mean_length": 1224.190625, + "completions/mean_terminated_length": 1224.190625, + "completions/min_length": 903.6, + "completions/min_terminated_length": 903.6, + "entropy": 0.28389262557029726, + "epoch": 3.5722679200940073, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7468788623809814, + "learning_rate": 1.473225102980373e-07, + "loss": -0.0036, + "num_tokens": 412704646.0, + "reward": 0.768750011920929, + "reward_std": 0.09476765841245652, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.768750011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27755117118358613, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0, + "sampling/importance_sampling_ratio/min": 0.33260057866573334, + "sampling/sampling_logp_difference/max": 1.1722631096839904, + "sampling/sampling_logp_difference/mean": 0.014068802073597907, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1792.6, + "completions/max_terminated_length": 1792.6, + "completions/mean_length": 1272.9375, + "completions/mean_terminated_length": 1272.9375, + "completions/min_length": 977.2, + "completions/min_terminated_length": 977.2, + "entropy": 0.282851442694664, + "epoch": 3.5781433607520565, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 1.4671674339714076e-07, + "loss": 0.0014, + "num_tokens": 413436114.0, + "reward": 0.7989583492279053, + "reward_std": 0.09803546294569969, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7989583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28233606219291685, + "sampling/importance_sampling_ratio/max": 1.9849604606628417, + "sampling/importance_sampling_ratio/mean": 0.9999540328979493, + "sampling/importance_sampling_ratio/min": 0.30122940987348557, + "sampling/sampling_logp_difference/max": 1.4530311226844788, + "sampling/sampling_logp_difference/mean": 0.01404977347701788, + "step": 3045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1722.4, + "completions/max_terminated_length": 1722.4, + "completions/mean_length": 1213.70625, + "completions/mean_terminated_length": 1213.70625, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "entropy": 0.28000237941741946, + "epoch": 3.5840188014101058, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7278285622596741, + "learning_rate": 1.4611097649624423e-07, + "loss": -0.002, + "num_tokens": 414138340.0, + "reward": 0.8671875, + "reward_std": 0.0896983802318573, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8671875119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24096697121858596, + "sampling/importance_sampling_ratio/max": 1.9819402933120727, + "sampling/importance_sampling_ratio/mean": 0.9999994993209839, + "sampling/importance_sampling_ratio/min": 0.3462633116170764, + "sampling/sampling_logp_difference/max": 1.7581159114837646, + "sampling/sampling_logp_difference/mean": 0.013958721049129963, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1755.4, + "completions/max_terminated_length": 1719.8, + "completions/mean_length": 1299.853125, + "completions/mean_terminated_length": 1289.995361328125, + "completions/min_length": 955.6, + "completions/min_terminated_length": 955.6, + "entropy": 0.2778258055448532, + "epoch": 3.589894242068155, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.42271292209625244, + "learning_rate": 1.4550520959534772e-07, + "loss": -0.0104, + "num_tokens": 414857417.0, + "reward": 0.7609375238418579, + "reward_std": 0.07389688491821289, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7609375238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29788713455200194, + "sampling/importance_sampling_ratio/max": 1.9975370168685913, + "sampling/importance_sampling_ratio/mean": 1.0000292778015136, + "sampling/importance_sampling_ratio/min": 0.32156047150492667, + "sampling/sampling_logp_difference/max": 1.5476860523223877, + "sampling/sampling_logp_difference/mean": 0.013813853822648525, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1609.8, + "completions/max_terminated_length": 1609.8, + "completions/mean_length": 1203.35625, + "completions/mean_terminated_length": 1203.35625, + "completions/min_length": 942.4, + "completions/min_terminated_length": 942.4, + "entropy": 0.27206062972545625, + "epoch": 3.5957696827262042, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7698452472686768, + "learning_rate": 1.4489944269445118e-07, + "loss": 0.0086, + "num_tokens": 415559611.0, + "reward": 0.8005208611488343, + "reward_std": 0.09430278986692428, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8005208611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28068689703941346, + "sampling/importance_sampling_ratio/max": 1.9747754335403442, + "sampling/importance_sampling_ratio/mean": 0.999997878074646, + "sampling/importance_sampling_ratio/min": 0.3823790907859802, + "sampling/sampling_logp_difference/max": 1.0575198411941529, + "sampling/sampling_logp_difference/mean": 0.01356248427182436, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1615.8, + "completions/max_terminated_length": 1615.8, + "completions/mean_length": 1229.265625, + "completions/mean_terminated_length": 1229.265625, + "completions/min_length": 946.2, + "completions/min_terminated_length": 946.2, + "entropy": 0.28500559329986574, + "epoch": 3.601645123384254, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.7499962449073792, + "learning_rate": 1.4429367579355464e-07, + "loss": 0.0021, + "num_tokens": 416258560.0, + "reward": 0.8640625238418579, + "reward_std": 0.12375695258378983, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8640625238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24018340706825256, + "sampling/importance_sampling_ratio/max": 1.937759232521057, + "sampling/importance_sampling_ratio/mean": 0.9999950051307678, + "sampling/importance_sampling_ratio/min": 0.37119474411010744, + "sampling/sampling_logp_difference/max": 1.0601333975791931, + "sampling/sampling_logp_difference/mean": 0.014097846299409866, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1569.8, + "completions/max_terminated_length": 1569.8, + "completions/mean_length": 1160.625, + "completions/mean_terminated_length": 1160.625, + "completions/min_length": 896.8, + "completions/min_terminated_length": 896.8, + "entropy": 0.25845094621181486, + "epoch": 3.607520564042303, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5670578479766846, + "learning_rate": 1.436879088926581e-07, + "loss": -0.0026, + "num_tokens": 416934696.0, + "reward": 0.8625000238418579, + "reward_std": 0.10116852670907975, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8625000238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24681516885757446, + "sampling/importance_sampling_ratio/max": 1.9113659143447876, + "sampling/importance_sampling_ratio/mean": 1.0000329732894897, + "sampling/importance_sampling_ratio/min": 0.39745662212371824, + "sampling/sampling_logp_difference/max": 0.9369120836257935, + "sampling/sampling_logp_difference/mean": 0.012975651957094669, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1579.6, + "completions/max_terminated_length": 1579.6, + "completions/mean_length": 1190.0125, + "completions/mean_terminated_length": 1190.0125, + "completions/min_length": 852.0, + "completions/min_terminated_length": 852.0, + "entropy": 0.28933039903640745, + "epoch": 3.6133960047003524, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7078462243080139, + "learning_rate": 1.4308214199176154e-07, + "loss": 0.0015, + "num_tokens": 417633500.0, + "reward": 0.9078125119209289, + "reward_std": 0.07728993520140648, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9078125119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1712805688381195, + "sampling/importance_sampling_ratio/max": 1.9024327516555786, + "sampling/importance_sampling_ratio/mean": 0.9999622344970703, + "sampling/importance_sampling_ratio/min": 0.3870242595672607, + "sampling/sampling_logp_difference/max": 0.9816065073013306, + "sampling/sampling_logp_difference/mean": 0.01414424292743206, + "step": 3075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1763.8, + "completions/max_terminated_length": 1761.4, + "completions/mean_length": 1259.35625, + "completions/mean_terminated_length": 1245.55146484375, + "completions/min_length": 953.0, + "completions/min_terminated_length": 953.0, + "entropy": 0.2990762531757355, + "epoch": 3.619271445358402, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47593507170677185, + "learning_rate": 1.4247637509086503e-07, + "loss": -0.0174, + "num_tokens": 418346446.0, + "reward": 0.9101562738418579, + "reward_std": 0.11092746555805207, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9101562738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20071874484419822, + "sampling/importance_sampling_ratio/max": 1.9293018341064454, + "sampling/importance_sampling_ratio/mean": 0.9999398589134216, + "sampling/importance_sampling_ratio/min": 0.31697064116597173, + "sampling/sampling_logp_difference/max": 1.6223263263702392, + "sampling/sampling_logp_difference/mean": 0.01478542685508728, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1825.0, + "completions/max_terminated_length": 1674.8, + "completions/mean_length": 1224.38125, + "completions/mean_terminated_length": 1220.5604736328125, + "completions/min_length": 927.4, + "completions/min_terminated_length": 927.4, + "entropy": 0.2816446602344513, + "epoch": 3.6251468860164513, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.45582300424575806, + "learning_rate": 1.418706081899685e-07, + "loss": -0.0189, + "num_tokens": 419068628.0, + "reward": 0.879687511920929, + "reward_std": 0.07607024312019348, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.879687511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1917457178235054, + "sampling/importance_sampling_ratio/max": 1.9380622863769532, + "sampling/importance_sampling_ratio/mean": 0.9999940156936645, + "sampling/importance_sampling_ratio/min": 0.38542511463165285, + "sampling/sampling_logp_difference/max": 0.9764352560043335, + "sampling/sampling_logp_difference/mean": 0.013983016833662987, + "step": 3085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.0, + "completions/max_terminated_length": 1821.0, + "completions/mean_length": 1329.95625, + "completions/mean_terminated_length": 1329.95625, + "completions/min_length": 977.6, + "completions/min_terminated_length": 977.6, + "entropy": 0.26992570161819457, + "epoch": 3.6310223266745005, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4041450619697571, + "learning_rate": 1.4126484128907196e-07, + "loss": -0.0004, + "num_tokens": 419780486.0, + "reward": 0.7979166746139527, + "reward_std": 0.09113014414906502, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7979166746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3225852161645889, + "sampling/importance_sampling_ratio/max": 1.9687642812728883, + "sampling/importance_sampling_ratio/mean": 1.0000601291656495, + "sampling/importance_sampling_ratio/min": 0.3267024874687195, + "sampling/sampling_logp_difference/max": 1.3370716214179992, + "sampling/sampling_logp_difference/mean": 0.013374082185328007, + "step": 3090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1923.0, + "completions/max_terminated_length": 1923.0, + "completions/mean_length": 1279.303125, + "completions/mean_terminated_length": 1279.303125, + "completions/min_length": 959.6, + "completions/min_terminated_length": 959.6, + "entropy": 0.29384331703186034, + "epoch": 3.6368977673325498, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.37273073196411133, + "learning_rate": 1.4065907438817542e-07, + "loss": 0.0016, + "num_tokens": 420497159.0, + "reward": 0.8317708492279052, + "reward_std": 0.07642179653048516, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8317708492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.14986341148614885, + "sampling/importance_sampling_ratio/max": 1.9512465000152588, + "sampling/importance_sampling_ratio/mean": 1.0000032186508179, + "sampling/importance_sampling_ratio/min": 0.3234915256500244, + "sampling/sampling_logp_difference/max": 1.1340311884880065, + "sampling/sampling_logp_difference/mean": 0.014189736545085907, + "step": 3095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1810.6, + "completions/max_terminated_length": 1810.6, + "completions/mean_length": 1364.653125, + "completions/mean_terminated_length": 1364.653125, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "entropy": 0.2985062301158905, + "epoch": 3.6427732079905994, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.40833938121795654, + "learning_rate": 1.4005330748727888e-07, + "loss": 0.0033, + "num_tokens": 421275000.0, + "reward": 0.8166146039962768, + "reward_std": 0.07883021160960198, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8166146039962768, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2833401970565319, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000027179718018, + "sampling/importance_sampling_ratio/min": 0.19541268646717072, + "sampling/sampling_logp_difference/max": 1.775990653038025, + "sampling/sampling_logp_difference/mean": 0.014779189042747021, + "step": 3100 + }, + { + "epoch": 3.6427732079905994, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1717.16, + "eval_completions/max_terminated_length": 1717.16, + "eval_completions/mean_length": 1257.2675, + "eval_completions/mean_terminated_length": 1257.2675, + "eval_completions/min_length": 936.28, + "eval_completions/min_terminated_length": 936.28, + "eval_entropy": 0.29172040104866026, + "eval_frac_reward_zero_std": 0.58, + "eval_loss": -0.0004253547522239387, + "eval_num_tokens": 421275000.0, + "eval_reward": 0.7633229267597198, + "eval_reward_std": 0.0836981363594532, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7633229267597198, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2984502202272415, + "eval_runtime": 473.8791, + "eval_samples_per_second": 0.211, + "eval_sampling/importance_sampling_ratio/max": 1.9595999717712402, + "eval_sampling/importance_sampling_ratio/mean": 0.999986469745636, + "eval_sampling/importance_sampling_ratio/min": 0.3356508800573647, + "eval_sampling/sampling_logp_difference/max": 1.4490785884857178, + "eval_sampling/sampling_logp_difference/mean": 0.01437144923955202, + "eval_steps_per_second": 0.004, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1738.0, + "completions/max_terminated_length": 1738.0, + "completions/mean_length": 1287.340625, + "completions/mean_terminated_length": 1287.340625, + "completions/min_length": 956.6, + "completions/min_terminated_length": 956.6, + "entropy": 0.30881457328796386, + "epoch": 3.6486486486486487, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.34845200181007385, + "learning_rate": 1.3944754058638237e-07, + "loss": 0.0014, + "num_tokens": 421994277.0, + "reward": 0.7625, + "reward_std": 0.10787137746810913, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.762500011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2934652417898178, + "sampling/importance_sampling_ratio/max": 1.8299882888793946, + "sampling/importance_sampling_ratio/mean": 0.9999529242515564, + "sampling/importance_sampling_ratio/min": 0.32334981858730316, + "sampling/sampling_logp_difference/max": 1.260542106628418, + "sampling/sampling_logp_difference/mean": 0.015047940611839294, + "step": 3105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1736.6, + "completions/max_terminated_length": 1736.6, + "completions/mean_length": 1294.334375, + "completions/mean_terminated_length": 1294.334375, + "completions/min_length": 1006.8, + "completions/min_terminated_length": 1006.8, + "entropy": 0.295719313621521, + "epoch": 3.654524089306698, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.37620779871940613, + "learning_rate": 1.3884177368548583e-07, + "loss": 0.0024, + "num_tokens": 422722320.0, + "reward": 0.8888020873069763, + "reward_std": 0.08835629969835282, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8888020873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17617078721523285, + "sampling/importance_sampling_ratio/max": 1.8987627744674682, + "sampling/importance_sampling_ratio/mean": 0.9999510288238526, + "sampling/importance_sampling_ratio/min": 0.39632954001426696, + "sampling/sampling_logp_difference/max": 1.019751000404358, + "sampling/sampling_logp_difference/mean": 0.014330669678747654, + "step": 3110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1861.6, + "completions/max_terminated_length": 1861.6, + "completions/mean_length": 1327.196875, + "completions/mean_terminated_length": 1327.196875, + "completions/min_length": 927.8, + "completions/min_terminated_length": 927.8, + "entropy": 0.30390411615371704, + "epoch": 3.6603995299647476, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.3971744179725647, + "learning_rate": 1.3823600678458927e-07, + "loss": -0.0021, + "num_tokens": 423456351.0, + "reward": 0.8244791805744172, + "reward_std": 0.04826573207974434, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8244791865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.155859262496233, + "sampling/importance_sampling_ratio/max": 1.949301791191101, + "sampling/importance_sampling_ratio/mean": 0.9999527335166931, + "sampling/importance_sampling_ratio/min": 0.40473890900611875, + "sampling/sampling_logp_difference/max": 1.2450590133666992, + "sampling/sampling_logp_difference/mean": 0.014594112709164619, + "step": 3115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1975.2, + "completions/max_terminated_length": 1975.2, + "completions/mean_length": 1400.11875, + "completions/mean_terminated_length": 1400.11875, + "completions/min_length": 996.2, + "completions/min_terminated_length": 996.2, + "entropy": 0.29585008025169374, + "epoch": 3.666274970622797, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6170745491981506, + "learning_rate": 1.3763023988369273e-07, + "loss": 0.0028, + "num_tokens": 424206165.0, + "reward": 0.8633333444595337, + "reward_std": 0.08815523274242879, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8633333444595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24421373009681702, + "sampling/importance_sampling_ratio/max": 1.9507896423339843, + "sampling/importance_sampling_ratio/mean": 1.0000440001487731, + "sampling/importance_sampling_ratio/min": 0.3013356953859329, + "sampling/sampling_logp_difference/max": 1.284391450881958, + "sampling/sampling_logp_difference/mean": 0.01431298851966858, + "step": 3120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1929.2, + "completions/max_terminated_length": 1880.8, + "completions/mean_length": 1353.30625, + "completions/mean_terminated_length": 1350.4684326171875, + "completions/min_length": 1014.6, + "completions/min_terminated_length": 1014.6, + "entropy": 0.29437545537948606, + "epoch": 3.672150411280846, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7374795079231262, + "learning_rate": 1.370244729827962e-07, + "loss": 0.0048, + "num_tokens": 424945763.0, + "reward": 0.8335937619209289, + "reward_std": 0.08211355954408646, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8335937619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2667385458946228, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000265955924987, + "sampling/importance_sampling_ratio/min": 0.2846124005503953, + "sampling/sampling_logp_difference/max": 2.374808597564697, + "sampling/sampling_logp_difference/mean": 0.014504742994904517, + "step": 3125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1820.4, + "completions/max_terminated_length": 1820.4, + "completions/mean_length": 1330.065625, + "completions/mean_terminated_length": 1330.065625, + "completions/min_length": 988.8, + "completions/min_terminated_length": 988.8, + "entropy": 0.2882347762584686, + "epoch": 3.6780258519388953, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.58107990026474, + "learning_rate": 1.3641870608189969e-07, + "loss": -0.0016, + "num_tokens": 425713832.0, + "reward": 0.8757812738418579, + "reward_std": 0.08430513888597488, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8757812738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21622689962387084, + "sampling/importance_sampling_ratio/max": 1.990927243232727, + "sampling/importance_sampling_ratio/mean": 0.9999788165092468, + "sampling/importance_sampling_ratio/min": 0.3501891404390335, + "sampling/sampling_logp_difference/max": 1.1427728176116942, + "sampling/sampling_logp_difference/mean": 0.013877778686583042, + "step": 3130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1819.2, + "completions/max_terminated_length": 1819.2, + "completions/mean_length": 1303.94375, + "completions/mean_terminated_length": 1303.94375, + "completions/min_length": 986.8, + "completions/min_terminated_length": 986.8, + "entropy": 0.2957104444503784, + "epoch": 3.6839012925969445, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 1.3581293918100315e-07, + "loss": 0.0038, + "num_tokens": 426498822.0, + "reward": 0.760937511920929, + "reward_std": 0.06861766874790191, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.760937511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.401876425743103, + "sampling/importance_sampling_ratio/max": 1.9452528476715087, + "sampling/importance_sampling_ratio/mean": 0.9999453067779541, + "sampling/importance_sampling_ratio/min": 0.2957945063710213, + "sampling/sampling_logp_difference/max": 1.479243540763855, + "sampling/sampling_logp_difference/mean": 0.014747031778097153, + "step": 3135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1718.2, + "completions/max_terminated_length": 1718.2, + "completions/mean_length": 1282.125, + "completions/mean_terminated_length": 1282.125, + "completions/min_length": 988.6, + "completions/min_terminated_length": 988.6, + "entropy": 0.28800985813140867, + "epoch": 3.6897767332549942, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5976863503456116, + "learning_rate": 1.352071722801066e-07, + "loss": -0.002, + "num_tokens": 427214238.0, + "reward": 0.8390625238418579, + "reward_std": 0.06042231023311615, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8390625238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28592842221260073, + "sampling/importance_sampling_ratio/max": 1.9544946670532226, + "sampling/importance_sampling_ratio/mean": 0.9999897599220275, + "sampling/importance_sampling_ratio/min": 0.2928850159049034, + "sampling/sampling_logp_difference/max": 1.3969106674194336, + "sampling/sampling_logp_difference/mean": 0.014067772217094899, + "step": 3140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1836.0, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 1300.1125, + "completions/mean_terminated_length": 1300.1125, + "completions/min_length": 907.2, + "completions/min_terminated_length": 907.2, + "entropy": 0.28729010820388795, + "epoch": 3.6956521739130435, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8007182478904724, + "learning_rate": 1.3460140537921007e-07, + "loss": -0.0018, + "num_tokens": 427997730.0, + "reward": 0.8020833492279053, + "reward_std": 0.13551612198352814, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2759165666997433, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000725269317627, + "sampling/importance_sampling_ratio/min": 0.20970812886953355, + "sampling/sampling_logp_difference/max": 1.6685986518859863, + "sampling/sampling_logp_difference/mean": 0.014266648329794407, + "step": 3145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1675.2, + "completions/max_terminated_length": 1675.2, + "completions/mean_length": 1290.090625, + "completions/mean_terminated_length": 1290.090625, + "completions/min_length": 965.8, + "completions/min_terminated_length": 965.8, + "entropy": 0.28130186200141905, + "epoch": 3.7015276145710927, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.35631677508354187, + "learning_rate": 1.3399563847831354e-07, + "loss": -0.0013, + "num_tokens": 428694863.0, + "reward": 0.8708854198455811, + "reward_std": 0.05269531458616257, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8708854198455811, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22914678156375884, + "sampling/importance_sampling_ratio/max": 1.9211535692214965, + "sampling/importance_sampling_ratio/mean": 0.9999082922935486, + "sampling/importance_sampling_ratio/min": 0.37740443348884584, + "sampling/sampling_logp_difference/max": 1.008909249305725, + "sampling/sampling_logp_difference/mean": 0.013886995241045951, + "step": 3150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1931.8, + "completions/max_terminated_length": 1931.8, + "completions/mean_length": 1323.0625, + "completions/mean_terminated_length": 1323.0625, + "completions/min_length": 911.6, + "completions/min_terminated_length": 911.6, + "entropy": 0.3171755850315094, + "epoch": 3.7074030552291424, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7349398136138916, + "learning_rate": 1.33389871577417e-07, + "loss": -0.0039, + "num_tokens": 429435427.0, + "reward": 0.8898437738418579, + "reward_std": 0.04950306043028831, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8898437738418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.176000939309597, + "sampling/importance_sampling_ratio/max": 1.988955020904541, + "sampling/importance_sampling_ratio/mean": 0.9999869346618653, + "sampling/importance_sampling_ratio/min": 0.4141034007072449, + "sampling/sampling_logp_difference/max": 0.8982202529907226, + "sampling/sampling_logp_difference/mean": 0.015075892955064774, + "step": 3155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1743.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 1276.153125, + "completions/mean_terminated_length": 1276.153125, + "completions/min_length": 955.2, + "completions/min_terminated_length": 955.2, + "entropy": 0.2771688997745514, + "epoch": 3.7132784958871916, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5998103618621826, + "learning_rate": 1.3278410467652046e-07, + "loss": -0.0041, + "num_tokens": 430172244.0, + "reward": 0.9096354246139526, + "reward_std": 0.0633207380771637, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9096354246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16724707335233688, + "sampling/importance_sampling_ratio/max": 1.961449098587036, + "sampling/importance_sampling_ratio/mean": 0.9999847888946534, + "sampling/importance_sampling_ratio/min": 0.3520124971866608, + "sampling/sampling_logp_difference/max": 1.2267968893051147, + "sampling/sampling_logp_difference/mean": 0.013828900456428529, + "step": 3160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1734.0, + "completions/max_terminated_length": 1734.0, + "completions/mean_length": 1276.95, + "completions/mean_terminated_length": 1276.95, + "completions/min_length": 936.2, + "completions/min_terminated_length": 936.2, + "entropy": 0.2937505543231964, + "epoch": 3.719153936545241, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.5660243034362793, + "learning_rate": 1.3217833777562393e-07, + "loss": -0.0023, + "num_tokens": 430906164.0, + "reward": 0.8408854246139527, + "reward_std": 0.0947806142270565, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8408854246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29198580980300903, + "sampling/importance_sampling_ratio/max": 1.9845717906951905, + "sampling/importance_sampling_ratio/mean": 1.0000143647193909, + "sampling/importance_sampling_ratio/min": 0.36637015342712403, + "sampling/sampling_logp_difference/max": 1.0281299352645874, + "sampling/sampling_logp_difference/mean": 0.014127112366259098, + "step": 3165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1809.8, + "completions/max_terminated_length": 1809.8, + "completions/mean_length": 1347.85625, + "completions/mean_terminated_length": 1347.85625, + "completions/min_length": 1005.0, + "completions/min_terminated_length": 1005.0, + "entropy": 0.2895920634269714, + "epoch": 3.72502937720329, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6063299775123596, + "learning_rate": 1.315725708747274e-07, + "loss": 0.0046, + "num_tokens": 431671334.0, + "reward": 0.8238020896911621, + "reward_std": 0.06915819272398949, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8238020896911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21795693933963775, + "sampling/importance_sampling_ratio/max": 1.9383326530456544, + "sampling/importance_sampling_ratio/mean": 0.9999857902526855, + "sampling/importance_sampling_ratio/min": 0.31745859086513517, + "sampling/sampling_logp_difference/max": 1.318444514274597, + "sampling/sampling_logp_difference/mean": 0.014070061966776848, + "step": 3170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1714.2, + "completions/max_terminated_length": 1714.2, + "completions/mean_length": 1256.196875, + "completions/mean_terminated_length": 1256.196875, + "completions/min_length": 964.8, + "completions/min_terminated_length": 964.8, + "entropy": 0.2780440628528595, + "epoch": 3.7309048178613398, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4056414067745209, + "learning_rate": 1.3096680397383085e-07, + "loss": -0.0001, + "num_tokens": 432385749.0, + "reward": 0.903125, + "reward_std": 0.06527099013328552, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.903125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1702724814414978, + "sampling/importance_sampling_ratio/max": 1.9694818019866944, + "sampling/importance_sampling_ratio/mean": 0.9998815774917602, + "sampling/importance_sampling_ratio/min": 0.30392550230026244, + "sampling/sampling_logp_difference/max": 1.3078163862228394, + "sampling/sampling_logp_difference/mean": 0.013784093409776687, + "step": 3175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1895.6, + "completions/max_terminated_length": 1895.6, + "completions/mean_length": 1292.0875, + "completions/mean_terminated_length": 1292.0875, + "completions/min_length": 922.4, + "completions/min_terminated_length": 922.4, + "entropy": 0.2698298662900925, + "epoch": 3.736780258519389, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5389244556427002, + "learning_rate": 1.3036103707293434e-07, + "loss": -0.0008, + "num_tokens": 433123985.0, + "reward": 0.8802083611488343, + "reward_std": 0.09341669231653213, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8802083611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23054814338684082, + "sampling/importance_sampling_ratio/max": 1.9669621229171752, + "sampling/importance_sampling_ratio/mean": 1.0000298500061036, + "sampling/importance_sampling_ratio/min": 0.3251545369625092, + "sampling/sampling_logp_difference/max": 1.1636561393737792, + "sampling/sampling_logp_difference/mean": 0.0135076355189085, + "step": 3180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1696.8, + "completions/max_terminated_length": 1696.8, + "completions/mean_length": 1231.425, + "completions/mean_terminated_length": 1231.425, + "completions/min_length": 858.2, + "completions/min_terminated_length": 858.2, + "entropy": 0.28715863823890686, + "epoch": 3.7426556991774382, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 1.297552701720378e-07, + "loss": -0.0024, + "num_tokens": 433831369.0, + "reward": 0.8338541746139526, + "reward_std": 0.05719553902745247, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8338541746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17622071206569673, + "sampling/importance_sampling_ratio/max": 1.9232003927230834, + "sampling/importance_sampling_ratio/mean": 0.9998136281967163, + "sampling/importance_sampling_ratio/min": 0.38944405019283296, + "sampling/sampling_logp_difference/max": 1.0467731714248658, + "sampling/sampling_logp_difference/mean": 0.014088386856019496, + "step": 3185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1944.8, + "completions/max_terminated_length": 1944.8, + "completions/mean_length": 1343.6875, + "completions/mean_terminated_length": 1343.6875, + "completions/min_length": 998.2, + "completions/min_terminated_length": 998.2, + "entropy": 0.28584455847740176, + "epoch": 3.748531139835488, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7371358871459961, + "learning_rate": 1.2914950327114127e-07, + "loss": 0.0072, + "num_tokens": 434593301.0, + "reward": 0.7756250143051148, + "reward_std": 0.0602168183773756, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7756250143051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29703550338745116, + "sampling/importance_sampling_ratio/max": 1.9905659914016725, + "sampling/importance_sampling_ratio/mean": 1.0000561237335206, + "sampling/importance_sampling_ratio/min": 0.382511293888092, + "sampling/sampling_logp_difference/max": 1.0009687900543214, + "sampling/sampling_logp_difference/mean": 0.01421151626855135, + "step": 3190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1796.8, + "completions/max_terminated_length": 1796.8, + "completions/mean_length": 1256.571875, + "completions/mean_terminated_length": 1256.571875, + "completions/min_length": 887.8, + "completions/min_terminated_length": 887.8, + "entropy": 0.2753683507442474, + "epoch": 3.754406580493537, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7251403331756592, + "learning_rate": 1.2854373637024473e-07, + "loss": -0.0012, + "num_tokens": 435349260.0, + "reward": 0.9098958492279052, + "reward_std": 0.07237763702869415, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9098958492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19696723371744157, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999914765357971, + "sampling/importance_sampling_ratio/min": 0.30798264741897585, + "sampling/sampling_logp_difference/max": 1.2155251502990723, + "sampling/sampling_logp_difference/mean": 0.013821718096733094, + "step": 3195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1932.0, + "completions/max_terminated_length": 1932.0, + "completions/mean_length": 1318.975, + "completions/mean_terminated_length": 1318.975, + "completions/min_length": 970.8, + "completions/min_terminated_length": 970.8, + "entropy": 0.2900474309921265, + "epoch": 3.7602820211515864, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6051058769226074, + "learning_rate": 1.2793796946934817e-07, + "loss": 0.009, + "num_tokens": 436108212.0, + "reward": 0.7828125, + "reward_std": 0.10932088047266006, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7828125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28247700333595277, + "sampling/importance_sampling_ratio/max": 1.9975552320480348, + "sampling/importance_sampling_ratio/mean": 0.9999950647354126, + "sampling/importance_sampling_ratio/min": 0.34100759625434873, + "sampling/sampling_logp_difference/max": 1.166985023021698, + "sampling/sampling_logp_difference/mean": 0.014263258688151837, + "step": 3200 + }, + { + "epoch": 3.7602820211515864, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1706.72, + "eval_completions/max_terminated_length": 1706.72, + "eval_completions/mean_length": 1246.888125, + "eval_completions/mean_terminated_length": 1246.888125, + "eval_completions/min_length": 955.2, + "eval_completions/min_terminated_length": 955.2, + "eval_entropy": 0.2836748969554901, + "eval_frac_reward_zero_std": 0.58, + "eval_loss": 0.0020352269057184458, + "eval_num_tokens": 436108212.0, + "eval_reward": 0.7717708504199982, + "eval_reward_std": 0.08081570498645306, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7717708504199982, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29261682510375975, + "eval_runtime": 476.9187, + "eval_samples_per_second": 0.21, + "eval_sampling/importance_sampling_ratio/max": 1.9656561374664308, + "eval_sampling/importance_sampling_ratio/mean": 0.9999639821052552, + "eval_sampling/importance_sampling_ratio/min": 0.30387570122024044, + "eval_sampling/sampling_logp_difference/max": 1.6534623003005982, + "eval_sampling/sampling_logp_difference/mean": 0.01408041562885046, + "eval_steps_per_second": 0.004, + "step": 3200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.4, + "completions/max_terminated_length": 1647.4, + "completions/mean_length": 1226.03125, + "completions/mean_terminated_length": 1226.03125, + "completions/min_length": 913.4, + "completions/min_terminated_length": 913.4, + "entropy": 0.2630267202854156, + "epoch": 3.7661574618096356, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.0, + "learning_rate": 1.2733220256845166e-07, + "loss": 0.0038, + "num_tokens": 436822078.0, + "reward": 0.9191145896911621, + "reward_std": 0.024364107847213747, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9191145896911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1650282010436058, + "sampling/importance_sampling_ratio/max": 1.9180307149887086, + "sampling/importance_sampling_ratio/mean": 0.9998865008354187, + "sampling/importance_sampling_ratio/min": 0.3275682792067528, + "sampling/sampling_logp_difference/max": 1.269556188583374, + "sampling/sampling_logp_difference/mean": 0.013227949663996697, + "step": 3205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1672.2, + "completions/max_terminated_length": 1672.2, + "completions/mean_length": 1261.959375, + "completions/mean_terminated_length": 1261.959375, + "completions/min_length": 888.4, + "completions/min_terminated_length": 888.4, + "entropy": 0.2816715180873871, + "epoch": 3.772032902467685, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.0, + "learning_rate": 1.2672643566755512e-07, + "loss": -0.0017, + "num_tokens": 437541409.0, + "reward": 0.8677083492279053, + "reward_std": 0.031082433462142945, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8677083492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24662085697054864, + "sampling/importance_sampling_ratio/max": 1.9304154872894288, + "sampling/importance_sampling_ratio/mean": 0.9999831080436706, + "sampling/importance_sampling_ratio/min": 0.4297758400440216, + "sampling/sampling_logp_difference/max": 0.8534170627593994, + "sampling/sampling_logp_difference/mean": 0.014014366827905179, + "step": 3210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1905.6, + "completions/max_terminated_length": 1905.6, + "completions/mean_length": 1288.709375, + "completions/mean_terminated_length": 1288.709375, + "completions/min_length": 924.4, + "completions/min_terminated_length": 924.4, + "entropy": 0.29503530263900757, + "epoch": 3.7779083431257345, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.3730573058128357, + "learning_rate": 1.2612066876665858e-07, + "loss": -0.0028, + "num_tokens": 438272292.0, + "reward": 0.94453125, + "reward_std": 0.06546878516674041, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.94453125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1374159798026085, + "sampling/importance_sampling_ratio/max": 1.9494167804718017, + "sampling/importance_sampling_ratio/mean": 1.0000098466873169, + "sampling/importance_sampling_ratio/min": 0.3806808590888977, + "sampling/sampling_logp_difference/max": 1.0228654861450195, + "sampling/sampling_logp_difference/mean": 0.014348461478948592, + "step": 3215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1635.2, + "completions/max_terminated_length": 1635.2, + "completions/mean_length": 1204.7875, + "completions/mean_terminated_length": 1204.7875, + "completions/min_length": 843.6, + "completions/min_terminated_length": 843.6, + "entropy": 0.2663033068180084, + "epoch": 3.7837837837837838, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 1.2551490186576204e-07, + "loss": 0.0036, + "num_tokens": 438955920.0, + "reward": 0.8723958492279053, + "reward_std": 0.06076589897274971, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8723958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2042554274201393, + "sampling/importance_sampling_ratio/max": 1.8793493509292603, + "sampling/importance_sampling_ratio/mean": 1.0000537037849426, + "sampling/importance_sampling_ratio/min": 0.37652627825737, + "sampling/sampling_logp_difference/max": 1.1007208824157715, + "sampling/sampling_logp_difference/mean": 0.013395345583558083, + "step": 3220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1841.0, + "completions/max_terminated_length": 1841.0, + "completions/mean_length": 1314.41875, + "completions/mean_terminated_length": 1314.41875, + "completions/min_length": 998.6, + "completions/min_terminated_length": 998.6, + "entropy": 0.29712930917739866, + "epoch": 3.789659224441833, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5394260287284851, + "learning_rate": 1.249091349648655e-07, + "loss": 0.0071, + "num_tokens": 439713382.0, + "reward": 0.8750000119209289, + "reward_std": 0.08217244297266006, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8750000119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21217068284749985, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999483704566956, + "sampling/importance_sampling_ratio/min": 0.3432030320167542, + "sampling/sampling_logp_difference/max": 1.1881095886230468, + "sampling/sampling_logp_difference/mean": 0.014405792579054832, + "step": 3225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1802.2, + "completions/max_terminated_length": 1802.2, + "completions/mean_length": 1308.49375, + "completions/mean_terminated_length": 1308.49375, + "completions/min_length": 957.8, + "completions/min_terminated_length": 957.8, + "entropy": 0.2827461302280426, + "epoch": 3.7955346650998827, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6620894074440002, + "learning_rate": 1.2430336806396897e-07, + "loss": -0.0002, + "num_tokens": 440436756.0, + "reward": 0.7843750238418579, + "reward_std": 0.07676660120487214, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7843750238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28354223668575285, + "sampling/importance_sampling_ratio/max": 1.9482336044311523, + "sampling/importance_sampling_ratio/mean": 1.0000541090965271, + "sampling/importance_sampling_ratio/min": 0.23740711510181428, + "sampling/sampling_logp_difference/max": 1.610867190361023, + "sampling/sampling_logp_difference/mean": 0.013959074392914772, + "step": 3230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1830.4, + "completions/max_terminated_length": 1830.4, + "completions/mean_length": 1286.35, + "completions/mean_terminated_length": 1286.35, + "completions/min_length": 939.2, + "completions/min_terminated_length": 939.2, + "entropy": 0.2779347479343414, + "epoch": 3.801410105757932, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7190999388694763, + "learning_rate": 1.2369760116307243e-07, + "loss": 0.0003, + "num_tokens": 441181252.0, + "reward": 0.8755208373069763, + "reward_std": 0.10408329591155052, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8755208373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21340711712837218, + "sampling/importance_sampling_ratio/max": 1.9635958909988402, + "sampling/importance_sampling_ratio/mean": 1.0000646114349365, + "sampling/importance_sampling_ratio/min": 0.26795525550842286, + "sampling/sampling_logp_difference/max": 1.4363911390304565, + "sampling/sampling_logp_difference/mean": 0.014074122533202171, + "step": 3235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1587.4, + "completions/max_terminated_length": 1587.4, + "completions/mean_length": 1182.74375, + "completions/mean_terminated_length": 1182.74375, + "completions/min_length": 876.8, + "completions/min_terminated_length": 876.8, + "entropy": 0.27898582220077517, + "epoch": 3.807285546415981, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4463542401790619, + "learning_rate": 1.2309183426217592e-07, + "loss": 0.0065, + "num_tokens": 441885602.0, + "reward": 0.861718761920929, + "reward_std": 0.0897410586476326, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.861718761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25226952880620956, + "sampling/importance_sampling_ratio/max": 1.9250823497772216, + "sampling/importance_sampling_ratio/mean": 0.9998960614204406, + "sampling/importance_sampling_ratio/min": 0.3237256646156311, + "sampling/sampling_logp_difference/max": 1.26419837474823, + "sampling/sampling_logp_difference/mean": 0.014087118953466416, + "step": 3240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1780.0, + "completions/max_terminated_length": 1780.0, + "completions/mean_length": 1281.84375, + "completions/mean_terminated_length": 1281.84375, + "completions/min_length": 966.4, + "completions/min_terminated_length": 966.4, + "entropy": 0.2782359480857849, + "epoch": 3.8131609870740304, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.621003270149231, + "learning_rate": 1.2248606736127939e-07, + "loss": -0.0019, + "num_tokens": 442625328.0, + "reward": 0.9307291746139527, + "reward_std": 0.0730149507522583, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9307291865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1572045348584652, + "sampling/importance_sampling_ratio/max": 1.9717673301696776, + "sampling/importance_sampling_ratio/mean": 0.9999985694885254, + "sampling/importance_sampling_ratio/min": 0.28547490313649176, + "sampling/sampling_logp_difference/max": 1.4838282227516175, + "sampling/sampling_logp_difference/mean": 0.01399837527424097, + "step": 3245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1660.6, + "completions/max_terminated_length": 1660.6, + "completions/mean_length": 1281.51875, + "completions/mean_terminated_length": 1281.51875, + "completions/min_length": 981.8, + "completions/min_terminated_length": 981.8, + "entropy": 0.28906986117362976, + "epoch": 3.8190364277320796, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.572178304195404, + "learning_rate": 1.2188030046038282e-07, + "loss": -0.0023, + "num_tokens": 443347766.0, + "reward": 0.8393229246139526, + "reward_std": 0.07080771774053574, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8393229246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23748641908168794, + "sampling/importance_sampling_ratio/max": 1.9597267866134644, + "sampling/importance_sampling_ratio/mean": 0.9999413132667542, + "sampling/importance_sampling_ratio/min": 0.2964545637369156, + "sampling/sampling_logp_difference/max": 1.229891586303711, + "sampling/sampling_logp_difference/mean": 0.014197415299713611, + "step": 3250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1707.8, + "completions/max_terminated_length": 1707.8, + "completions/mean_length": 1252.590625, + "completions/mean_terminated_length": 1252.590625, + "completions/min_length": 884.6, + "completions/min_terminated_length": 884.6, + "entropy": 0.2784834265708923, + "epoch": 3.8249118683901293, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.39195069670677185, + "learning_rate": 1.212745335594863e-07, + "loss": 0.0071, + "num_tokens": 444042115.0, + "reward": 0.8630208492279052, + "reward_std": 0.06553929708898068, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8630208492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2423491060733795, + "sampling/importance_sampling_ratio/max": 1.9346032381057738, + "sampling/importance_sampling_ratio/mean": 1.0000757694244384, + "sampling/importance_sampling_ratio/min": 0.31034799516201017, + "sampling/sampling_logp_difference/max": 1.2329391956329345, + "sampling/sampling_logp_difference/mean": 0.013708932884037494, + "step": 3255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1977.4, + "completions/max_terminated_length": 1880.0, + "completions/mean_length": 1329.4125, + "completions/mean_terminated_length": 1325.8530029296876, + "completions/min_length": 1028.4, + "completions/min_terminated_length": 1028.4, + "entropy": 0.2909530997276306, + "epoch": 3.8307873090481785, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4510413408279419, + "learning_rate": 1.2066876665858977e-07, + "loss": -0.0081, + "num_tokens": 444796419.0, + "reward": 0.8528645873069763, + "reward_std": 0.0627675049006939, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8528645992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26745030134916303, + "sampling/importance_sampling_ratio/max": 1.9303704738616942, + "sampling/importance_sampling_ratio/mean": 1.0000205278396606, + "sampling/importance_sampling_ratio/min": 0.27986125648299554, + "sampling/sampling_logp_difference/max": 5.938221645355225, + "sampling/sampling_logp_difference/mean": 0.01452437173575163, + "step": 3260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1806.4, + "completions/max_terminated_length": 1806.4, + "completions/mean_length": 1308.70625, + "completions/mean_terminated_length": 1308.70625, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.2953240931034088, + "epoch": 3.8366627497062282, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 1.2006299975769324e-07, + "loss": -0.0009, + "num_tokens": 445569973.0, + "reward": 0.8123437643051148, + "reward_std": 0.05132426992058754, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8123437643051148, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26707516610622406, + "sampling/importance_sampling_ratio/max": 1.9395072221755982, + "sampling/importance_sampling_ratio/mean": 1.0000399947166443, + "sampling/importance_sampling_ratio/min": 0.33533908128738404, + "sampling/sampling_logp_difference/max": 1.1794774770736693, + "sampling/sampling_logp_difference/mean": 0.014524004608392715, + "step": 3265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1771.0, + "completions/max_terminated_length": 1771.0, + "completions/mean_length": 1274.678125, + "completions/mean_terminated_length": 1274.678125, + "completions/min_length": 869.8, + "completions/min_terminated_length": 869.8, + "entropy": 0.2809648633003235, + "epoch": 3.8425381903642775, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.36577996611595154, + "learning_rate": 1.194572328567967e-07, + "loss": 0.0009, + "num_tokens": 446295278.0, + "reward": 0.9208333373069764, + "reward_std": 0.04342363029718399, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9208333373069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1181575708091259, + "sampling/importance_sampling_ratio/max": 1.9795986413955688, + "sampling/importance_sampling_ratio/mean": 1.0001219749450683, + "sampling/importance_sampling_ratio/min": 0.31653355807065964, + "sampling/sampling_logp_difference/max": 1.2452943086624146, + "sampling/sampling_logp_difference/mean": 0.013978814147412777, + "step": 3270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1794.0, + "completions/max_terminated_length": 1794.0, + "completions/mean_length": 1265.828125, + "completions/mean_terminated_length": 1265.828125, + "completions/min_length": 865.8, + "completions/min_terminated_length": 865.8, + "entropy": 0.28681405186653136, + "epoch": 3.8484136310223267, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7446997165679932, + "learning_rate": 1.1885146595590016e-07, + "loss": 0.0069, + "num_tokens": 447073655.0, + "reward": 0.8171354174613953, + "reward_std": 0.09392708986997604, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8171354174613953, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27176968157291415, + "sampling/importance_sampling_ratio/max": 1.941535973548889, + "sampling/importance_sampling_ratio/mean": 1.000026822090149, + "sampling/importance_sampling_ratio/min": 0.37527463138103484, + "sampling/sampling_logp_difference/max": 1.0598471283912658, + "sampling/sampling_logp_difference/mean": 0.014365506730973721, + "step": 3275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1762.8, + "completions/max_terminated_length": 1762.8, + "completions/mean_length": 1243.915625, + "completions/mean_terminated_length": 1243.915625, + "completions/min_length": 941.4, + "completions/min_terminated_length": 941.4, + "entropy": 0.26819110810756686, + "epoch": 3.854289071680376, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6066403985023499, + "learning_rate": 1.1824569905500363e-07, + "loss": -0.0064, + "num_tokens": 447775020.0, + "reward": 0.9101562619209289, + "reward_std": 0.07682659178972244, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9101562619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1867051303386688, + "sampling/importance_sampling_ratio/max": 1.9701864957809447, + "sampling/importance_sampling_ratio/mean": 1.00002464056015, + "sampling/importance_sampling_ratio/min": 0.3514017522335052, + "sampling/sampling_logp_difference/max": 1.1939729452133179, + "sampling/sampling_logp_difference/mean": 0.013550216145813466, + "step": 3280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1740.2, + "completions/max_terminated_length": 1740.2, + "completions/mean_length": 1230.046875, + "completions/mean_terminated_length": 1230.046875, + "completions/min_length": 923.4, + "completions/min_terminated_length": 923.4, + "entropy": 0.27420614361763, + "epoch": 3.860164512338425, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.4645025432109833, + "learning_rate": 1.176399321541071e-07, + "loss": -0.0008, + "num_tokens": 448493579.0, + "reward": 0.7045312643051147, + "reward_std": 0.05252151843160391, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7045312643051147, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3159709542989731, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999969244003296, + "sampling/importance_sampling_ratio/min": 0.31865512803196905, + "sampling/sampling_logp_difference/max": 1.5703006744384767, + "sampling/sampling_logp_difference/mean": 0.013808564841747284, + "step": 3285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1857.2, + "completions/max_terminated_length": 1857.2, + "completions/mean_length": 1297.978125, + "completions/mean_terminated_length": 1297.978125, + "completions/min_length": 910.6, + "completions/min_terminated_length": 910.6, + "entropy": 0.28309070467948916, + "epoch": 3.866039952996475, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.47286638617515564, + "learning_rate": 1.1703416525321055e-07, + "loss": -0.009, + "num_tokens": 449245108.0, + "reward": 0.8286458492279053, + "reward_std": 0.09949100911617278, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8286458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21464731693267822, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000564217567445, + "sampling/importance_sampling_ratio/min": 0.40592106580734255, + "sampling/sampling_logp_difference/max": 0.982337212562561, + "sampling/sampling_logp_difference/mean": 0.01408249158412218, + "step": 3290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.0, + "completions/max_terminated_length": 1622.0, + "completions/mean_length": 1222.871875, + "completions/mean_terminated_length": 1222.871875, + "completions/min_length": 878.2, + "completions/min_terminated_length": 878.2, + "entropy": 0.28852399289608, + "epoch": 3.871915393654524, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5962092280387878, + "learning_rate": 1.1642839835231403e-07, + "loss": 0.005, + "num_tokens": 449982747.0, + "reward": 0.7946875214576721, + "reward_std": 0.05859446972608566, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7946875333786011, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2911753743886948, + "sampling/importance_sampling_ratio/max": 1.9743703126907348, + "sampling/importance_sampling_ratio/mean": 1.000085210800171, + "sampling/importance_sampling_ratio/min": 0.3090327255427837, + "sampling/sampling_logp_difference/max": 1.462190842628479, + "sampling/sampling_logp_difference/mean": 0.014431641064584255, + "step": 3295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1854.8, + "completions/max_terminated_length": 1854.8, + "completions/mean_length": 1340.121875, + "completions/mean_terminated_length": 1340.121875, + "completions/min_length": 984.0, + "completions/min_terminated_length": 984.0, + "entropy": 0.27524838745594027, + "epoch": 3.8777908343125733, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.488313227891922, + "learning_rate": 1.1582263145141749e-07, + "loss": 0.001, + "num_tokens": 450719506.0, + "reward": 0.83671875, + "reward_std": 0.07024868726730346, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.836718761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24404936730861665, + "sampling/importance_sampling_ratio/max": 1.9955852031707764, + "sampling/importance_sampling_ratio/mean": 0.9999388694763184, + "sampling/importance_sampling_ratio/min": 0.33213537335395815, + "sampling/sampling_logp_difference/max": 1.3013247728347779, + "sampling/sampling_logp_difference/mean": 0.01370444092899561, + "step": 3300 + }, + { + "epoch": 3.8777908343125733, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1689.44, + "eval_completions/max_terminated_length": 1689.44, + "eval_completions/mean_length": 1224.116875, + "eval_completions/mean_terminated_length": 1224.116875, + "eval_completions/min_length": 919.28, + "eval_completions/min_terminated_length": 919.28, + "eval_entropy": 0.2747154176235199, + "eval_frac_reward_zero_std": 0.63, + "eval_loss": 0.0016999093350023031, + "eval_num_tokens": 450719506.0, + "eval_reward": 0.77307293176651, + "eval_reward_std": 0.07358525022864341, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.77307293176651, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2892902755737305, + "eval_runtime": 460.3218, + "eval_samples_per_second": 0.217, + "eval_sampling/importance_sampling_ratio/max": 1.9607455730438232, + "eval_sampling/importance_sampling_ratio/mean": 0.9999883246421813, + "eval_sampling/importance_sampling_ratio/min": 0.3240952134691179, + "eval_sampling/sampling_logp_difference/max": 1.5946112561225891, + "eval_sampling/sampling_logp_difference/mean": 0.013883443474769592, + "eval_steps_per_second": 0.004, + "step": 3300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1860.4, + "completions/max_terminated_length": 1860.4, + "completions/mean_length": 1337.828125, + "completions/mean_terminated_length": 1337.828125, + "completions/min_length": 1008.6, + "completions/min_terminated_length": 1008.6, + "entropy": 0.274376255273819, + "epoch": 3.883666274970623, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.4430628716945648, + "learning_rate": 1.1521686455052095e-07, + "loss": 0.0013, + "num_tokens": 451469419.0, + "reward": 0.8466145992279053, + "reward_std": 0.051951204985380174, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8466145992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2435604065656662, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000598907470704, + "sampling/importance_sampling_ratio/min": 0.2840733528137207, + "sampling/sampling_logp_difference/max": 1.5291481018066406, + "sampling/sampling_logp_difference/mean": 0.013875341042876244, + "step": 3305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1777.4, + "completions/max_terminated_length": 1777.4, + "completions/mean_length": 1238.178125, + "completions/mean_terminated_length": 1238.178125, + "completions/min_length": 889.6, + "completions/min_terminated_length": 889.6, + "entropy": 0.2748557984828949, + "epoch": 3.8895417156286722, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.46624255180358887, + "learning_rate": 1.1461109764962442e-07, + "loss": 0.0037, + "num_tokens": 452191012.0, + "reward": 0.9239583492279053, + "reward_std": 0.06863614469766617, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9239583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15527809858322145, + "sampling/importance_sampling_ratio/max": 1.9529468297958374, + "sampling/importance_sampling_ratio/mean": 0.9999673128128052, + "sampling/importance_sampling_ratio/min": 0.35668731927871705, + "sampling/sampling_logp_difference/max": 1.2592904806137084, + "sampling/sampling_logp_difference/mean": 0.013761545717716216, + "step": 3310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1729.8, + "completions/max_terminated_length": 1729.8, + "completions/mean_length": 1261.10625, + "completions/mean_terminated_length": 1261.10625, + "completions/min_length": 979.8, + "completions/min_terminated_length": 979.8, + "entropy": 0.27796257734298707, + "epoch": 3.8954171562867215, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4277585446834564, + "learning_rate": 1.1400533074872788e-07, + "loss": 0.0018, + "num_tokens": 452904374.0, + "reward": 0.909375011920929, + "reward_std": 0.062048446759581564, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.909375011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17083195745944976, + "sampling/importance_sampling_ratio/max": 1.863189125061035, + "sampling/importance_sampling_ratio/mean": 0.9999260902404785, + "sampling/importance_sampling_ratio/min": 0.3553848028182983, + "sampling/sampling_logp_difference/max": 1.1737365007400513, + "sampling/sampling_logp_difference/mean": 0.013707248121500015, + "step": 3315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1736.8, + "completions/max_terminated_length": 1736.8, + "completions/mean_length": 1264.659375, + "completions/mean_terminated_length": 1264.659375, + "completions/min_length": 948.2, + "completions/min_terminated_length": 948.2, + "entropy": 0.27279492020606994, + "epoch": 3.9012925969447707, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.650298535823822, + "learning_rate": 1.1339956384783135e-07, + "loss": -0.0041, + "num_tokens": 453642489.0, + "reward": 0.846875011920929, + "reward_std": 0.0856197141110897, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.846875011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25374895632266997, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000032126903534, + "sampling/importance_sampling_ratio/min": 0.3886173486709595, + "sampling/sampling_logp_difference/max": 1.1749614000320434, + "sampling/sampling_logp_difference/mean": 0.01352920550853014, + "step": 3320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1809.2, + "completions/max_terminated_length": 1753.6, + "completions/mean_length": 1218.434375, + "completions/mean_terminated_length": 1215.080126953125, + "completions/min_length": 850.4, + "completions/min_terminated_length": 850.4, + "entropy": 0.25508340895175935, + "epoch": 3.90716803760282, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.0, + "learning_rate": 1.1279379694693482e-07, + "loss": -0.0036, + "num_tokens": 454369584.0, + "reward": 0.8700520873069764, + "reward_std": 0.02701122909784317, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8700520873069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20110743045806884, + "sampling/importance_sampling_ratio/max": 1.851835560798645, + "sampling/importance_sampling_ratio/mean": 1.000059926509857, + "sampling/importance_sampling_ratio/min": 0.34435550272464754, + "sampling/sampling_logp_difference/max": 1.1637211084365844, + "sampling/sampling_logp_difference/mean": 0.013143818266689778, + "step": 3325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1748.0, + "completions/max_terminated_length": 1748.0, + "completions/mean_length": 1249.828125, + "completions/mean_terminated_length": 1249.828125, + "completions/min_length": 952.8, + "completions/min_terminated_length": 952.8, + "entropy": 0.2764308452606201, + "epoch": 3.9130434782608696, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.49342775344848633, + "learning_rate": 1.1218803004603827e-07, + "loss": 0.0013, + "num_tokens": 455124777.0, + "reward": 0.8270833492279053, + "reward_std": 0.06940719485282898, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8270833492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3089472085237503, + "sampling/importance_sampling_ratio/max": 1.8889091730117797, + "sampling/importance_sampling_ratio/mean": 0.9999609589576721, + "sampling/importance_sampling_ratio/min": 0.3440424233675003, + "sampling/sampling_logp_difference/max": 1.1790516376495361, + "sampling/sampling_logp_difference/mean": 0.01391413640230894, + "step": 3330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1740.6, + "completions/max_terminated_length": 1740.6, + "completions/mean_length": 1237.1875, + "completions/mean_terminated_length": 1237.1875, + "completions/min_length": 922.6, + "completions/min_terminated_length": 922.6, + "entropy": 0.2558923900127411, + "epoch": 3.918918918918919, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.6352602243423462, + "learning_rate": 1.1158226314514174e-07, + "loss": -0.0015, + "num_tokens": 455869669.0, + "reward": 0.770312511920929, + "reward_std": 0.03016253113746643, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.770312511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2921178758144379, + "sampling/importance_sampling_ratio/max": 1.9723082304000854, + "sampling/importance_sampling_ratio/mean": 1.0000006914138795, + "sampling/importance_sampling_ratio/min": 0.3538441300392151, + "sampling/sampling_logp_difference/max": 1.0752784729003906, + "sampling/sampling_logp_difference/mean": 0.01325883362442255, + "step": 3335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1676.0, + "completions/max_terminated_length": 1676.0, + "completions/mean_length": 1213.696875, + "completions/mean_terminated_length": 1213.696875, + "completions/min_length": 882.2, + "completions/min_terminated_length": 882.2, + "entropy": 0.2639861524105072, + "epoch": 3.9247943595769685, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 1.109764962442452e-07, + "loss": -0.0031, + "num_tokens": 456588324.0, + "reward": 0.8276041746139526, + "reward_std": 0.045714473351836205, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8276041746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2570072665810585, + "sampling/importance_sampling_ratio/max": 1.9386536121368407, + "sampling/importance_sampling_ratio/mean": 0.9998268485069275, + "sampling/importance_sampling_ratio/min": 0.3666181623935699, + "sampling/sampling_logp_difference/max": 1.0288118839263916, + "sampling/sampling_logp_difference/mean": 0.013535234890878201, + "step": 3340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 1798.0, + "completions/max_terminated_length": 1796.4, + "completions/mean_length": 1286.8375, + "completions/mean_terminated_length": 1271.7397705078124, + "completions/min_length": 940.4, + "completions/min_terminated_length": 940.4, + "entropy": 0.2659426271915436, + "epoch": 3.9306698002350178, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5957963466644287, + "learning_rate": 1.1037072934334868e-07, + "loss": -0.0059, + "num_tokens": 457293020.0, + "reward": 0.9078125119209289, + "reward_std": 0.057849539816379546, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9078125119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1975775107741356, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999847173690796, + "sampling/importance_sampling_ratio/min": 0.31824939250946044, + "sampling/sampling_logp_difference/max": 1.3847343921661377, + "sampling/sampling_logp_difference/mean": 0.013584697060286999, + "step": 3345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1622.0, + "completions/max_terminated_length": 1622.0, + "completions/mean_length": 1119.728125, + "completions/mean_terminated_length": 1119.728125, + "completions/min_length": 785.4, + "completions/min_terminated_length": 785.4, + "entropy": 0.24143437743186952, + "epoch": 3.936545240893067, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4451583921909332, + "learning_rate": 1.0976496244245213e-07, + "loss": 0.004, + "num_tokens": 457972389.0, + "reward": 0.8882812619209289, + "reward_std": 0.07290575057268142, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8882812619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19074126631021499, + "sampling/importance_sampling_ratio/max": 1.9808499813079834, + "sampling/importance_sampling_ratio/mean": 1.0000173211097718, + "sampling/importance_sampling_ratio/min": 0.3381476104259491, + "sampling/sampling_logp_difference/max": 1.2803590774536133, + "sampling/sampling_logp_difference/mean": 0.012810366414487361, + "step": 3350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1703.8, + "completions/max_terminated_length": 1703.8, + "completions/mean_length": 1184.834375, + "completions/mean_terminated_length": 1184.834375, + "completions/min_length": 811.2, + "completions/min_terminated_length": 811.2, + "entropy": 0.24822763800621034, + "epoch": 3.9424206815511162, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5334610939025879, + "learning_rate": 1.0915919554155561e-07, + "loss": 0.0016, + "num_tokens": 458701424.0, + "reward": 0.9039583563804626, + "reward_std": 0.08115731552243233, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9039583563804626, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18836814165115356, + "sampling/importance_sampling_ratio/max": 1.980670428276062, + "sampling/importance_sampling_ratio/mean": 1.00001859664917, + "sampling/importance_sampling_ratio/min": 0.35833509400172686, + "sampling/sampling_logp_difference/max": 4.0236934423446655, + "sampling/sampling_logp_difference/mean": 0.0130048006772995, + "step": 3355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1886.0, + "completions/max_terminated_length": 1886.0, + "completions/mean_length": 1239.865625, + "completions/mean_terminated_length": 1239.865625, + "completions/min_length": 899.2, + "completions/min_terminated_length": 899.2, + "entropy": 0.27396275401115416, + "epoch": 3.9482961222091655, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.7187315821647644, + "learning_rate": 1.0855342864065907e-07, + "loss": 0.0056, + "num_tokens": 459467877.0, + "reward": 0.7541666746139526, + "reward_std": 0.1147657498717308, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7541666746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3114150196313858, + "sampling/importance_sampling_ratio/max": 1.89167377948761, + "sampling/importance_sampling_ratio/mean": 0.9999890923500061, + "sampling/importance_sampling_ratio/min": 0.37039981186389925, + "sampling/sampling_logp_difference/max": 1.0398942470550536, + "sampling/sampling_logp_difference/mean": 0.013965315371751785, + "step": 3360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1493.8, + "completions/max_terminated_length": 1493.8, + "completions/mean_length": 1098.35625, + "completions/mean_terminated_length": 1098.35625, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.2534207671880722, + "epoch": 3.954171562867215, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.579468846321106, + "learning_rate": 1.0794766173976253e-07, + "loss": -0.0038, + "num_tokens": 460121783.0, + "reward": 0.868541669845581, + "reward_std": 0.04859443977475166, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.868541669845581, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21250081658363343, + "sampling/importance_sampling_ratio/max": 1.9952316761016846, + "sampling/importance_sampling_ratio/mean": 1.0000939130783082, + "sampling/importance_sampling_ratio/min": 0.39403712153434756, + "sampling/sampling_logp_difference/max": 1.1500419616699218, + "sampling/sampling_logp_difference/mean": 0.01323564574122429, + "step": 3365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1879.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 1251.409375, + "completions/mean_terminated_length": 1241.074267578125, + "completions/min_length": 907.0, + "completions/min_terminated_length": 907.0, + "entropy": 0.2608177125453949, + "epoch": 3.9600470035252644, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5231078863143921, + "learning_rate": 1.07341894838866e-07, + "loss": -0.0226, + "num_tokens": 460847966.0, + "reward": 0.7572916746139526, + "reward_std": 0.060109014809131625, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7572916746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28681475222110747, + "sampling/importance_sampling_ratio/max": 1.9938726425170898, + "sampling/importance_sampling_ratio/mean": 1.0000676155090331, + "sampling/importance_sampling_ratio/min": 0.39298430681228635, + "sampling/sampling_logp_difference/max": 1.071886992454529, + "sampling/sampling_logp_difference/mean": 0.013303074613213539, + "step": 3370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1426.8, + "completions/max_terminated_length": 1426.8, + "completions/mean_length": 1150.00625, + "completions/mean_terminated_length": 1150.00625, + "completions/min_length": 928.8, + "completions/min_terminated_length": 928.8, + "entropy": 0.2832071840763092, + "epoch": 3.9659224441833136, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.3876747190952301, + "learning_rate": 1.0673612793796946e-07, + "loss": -0.0016, + "num_tokens": 461555024.0, + "reward": 0.8479166746139526, + "reward_std": 0.02359100729227066, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8479166746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2031202271580696, + "sampling/importance_sampling_ratio/max": 1.9597638845443726, + "sampling/importance_sampling_ratio/mean": 1.0000397205352782, + "sampling/importance_sampling_ratio/min": 0.4398146092891693, + "sampling/sampling_logp_difference/max": 0.9516366958618164, + "sampling/sampling_logp_difference/mean": 0.014147293195128441, + "step": 3375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1734.8, + "completions/max_terminated_length": 1734.8, + "completions/mean_length": 1196.45625, + "completions/mean_terminated_length": 1196.45625, + "completions/min_length": 912.8, + "completions/min_terminated_length": 912.8, + "entropy": 0.24856521785259247, + "epoch": 3.9717978848413633, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.44269227981567383, + "learning_rate": 1.0613036103707294e-07, + "loss": 0.0011, + "num_tokens": 462241618.0, + "reward": 0.7182291626930237, + "reward_std": 0.058018694072961806, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7182291626930237, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33283857107162473, + "sampling/importance_sampling_ratio/max": 1.9259427070617676, + "sampling/importance_sampling_ratio/mean": 1.0000762820243836, + "sampling/importance_sampling_ratio/min": 0.319977280497551, + "sampling/sampling_logp_difference/max": 1.3419868707656861, + "sampling/sampling_logp_difference/mean": 0.012762147746980191, + "step": 3380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.2, + "completions/max_terminated_length": 1593.2, + "completions/mean_length": 1190.53125, + "completions/mean_terminated_length": 1190.53125, + "completions/min_length": 852.0, + "completions/min_terminated_length": 852.0, + "entropy": 0.2616691470146179, + "epoch": 3.9776733254994125, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.4497514069080353, + "learning_rate": 1.055245941361764e-07, + "loss": 0.003, + "num_tokens": 462957900.0, + "reward": 0.896875, + "reward_std": 0.03596546053886414, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.896875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17488451898097992, + "sampling/importance_sampling_ratio/max": 1.987834596633911, + "sampling/importance_sampling_ratio/mean": 1.0000129342079163, + "sampling/importance_sampling_ratio/min": 0.36617528796195986, + "sampling/sampling_logp_difference/max": 1.1533316850662232, + "sampling/sampling_logp_difference/mean": 0.013550573959946632, + "step": 3385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1514.2, + "completions/max_terminated_length": 1514.2, + "completions/mean_length": 1162.390625, + "completions/mean_terminated_length": 1162.390625, + "completions/min_length": 885.8, + "completions/min_terminated_length": 885.8, + "entropy": 0.26850571632385256, + "epoch": 3.983548766157462, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4390755593776703, + "learning_rate": 1.0491882723527986e-07, + "loss": 0.0001, + "num_tokens": 463636649.0, + "reward": 0.7920312643051147, + "reward_std": 0.08977707475423813, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7920312643051147, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29775896966457366, + "sampling/importance_sampling_ratio/max": 1.9499453544616698, + "sampling/importance_sampling_ratio/mean": 0.9999655961990357, + "sampling/importance_sampling_ratio/min": 0.35879728496074675, + "sampling/sampling_logp_difference/max": 1.1991132736206054, + "sampling/sampling_logp_difference/mean": 0.013767444901168347, + "step": 3390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1806.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1238.225, + "completions/mean_terminated_length": 1238.225, + "completions/min_length": 815.6, + "completions/min_terminated_length": 815.6, + "entropy": 0.2589147299528122, + "epoch": 3.989424206815511, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.3654157817363739, + "learning_rate": 1.0431306033438332e-07, + "loss": 0.001, + "num_tokens": 464341057.0, + "reward": 0.8408854246139527, + "reward_std": 0.10250527374446392, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8408854246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2893193304538727, + "sampling/importance_sampling_ratio/max": 1.986938238143921, + "sampling/importance_sampling_ratio/mean": 1.000028955936432, + "sampling/importance_sampling_ratio/min": 0.2818640649318695, + "sampling/sampling_logp_difference/max": 1.6181657314300537, + "sampling/sampling_logp_difference/mean": 0.013167793862521648, + "step": 3395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1650.2, + "completions/max_terminated_length": 1650.2, + "completions/mean_length": 1230.4125, + "completions/mean_terminated_length": 1230.4125, + "completions/min_length": 968.8, + "completions/min_terminated_length": 968.8, + "entropy": 0.27528418600559235, + "epoch": 3.9952996474735603, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.3999543786048889, + "learning_rate": 1.0370729343348679e-07, + "loss": -0.0034, + "num_tokens": 465059861.0, + "reward": 0.7984375, + "reward_std": 0.07827110588550568, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7984375, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2598001092672348, + "sampling/importance_sampling_ratio/max": 1.9498087167739868, + "sampling/importance_sampling_ratio/mean": 1.0000772595405578, + "sampling/importance_sampling_ratio/min": 0.27214218527078626, + "sampling/sampling_logp_difference/max": 1.4822240352630616, + "sampling/sampling_logp_difference/mean": 0.013907233253121376, + "step": 3400 + }, + { + "epoch": 3.9952996474735603, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1572.6, + "eval_completions/max_terminated_length": 1572.6, + "eval_completions/mean_length": 1161.240625, + "eval_completions/mean_terminated_length": 1161.240625, + "eval_completions/min_length": 880.12, + "eval_completions/min_terminated_length": 880.12, + "eval_entropy": 0.2626095861196518, + "eval_frac_reward_zero_std": 0.6, + "eval_loss": 0.0030293413437902927, + "eval_num_tokens": 465059861.0, + "eval_reward": 0.7789062619209289, + "eval_reward_std": 0.07541791707277298, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7789062619209289, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2894728544354439, + "eval_runtime": 437.9033, + "eval_samples_per_second": 0.228, + "eval_sampling/importance_sampling_ratio/max": 1.937536702156067, + "eval_sampling/importance_sampling_ratio/mean": 1.0000023913383485, + "eval_sampling/importance_sampling_ratio/min": 0.3022413222497363, + "eval_sampling/sampling_logp_difference/max": 2.2783599162101744, + "eval_sampling/sampling_logp_difference/mean": 0.01340147852897644, + "eval_steps_per_second": 0.005, + "step": 3400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.2, + "completions/max_terminated_length": 1525.2, + "completions/mean_length": 1220.359375, + "completions/mean_terminated_length": 1220.359375, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.2618350923061371, + "epoch": 4.0011750881316095, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.47610652446746826, + "learning_rate": 1.0310152653259026e-07, + "loss": 0.0047, + "num_tokens": 465757512.0, + "reward": 0.8255208373069763, + "reward_std": 0.07653152495622635, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8255208373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2626977309584618, + "sampling/importance_sampling_ratio/max": 1.943035101890564, + "sampling/importance_sampling_ratio/mean": 0.9999830961227417, + "sampling/importance_sampling_ratio/min": 0.2748304158449173, + "sampling/sampling_logp_difference/max": 1.504104995727539, + "sampling/sampling_logp_difference/mean": 0.013258552365005016, + "step": 3405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1754.2, + "completions/max_terminated_length": 1754.2, + "completions/mean_length": 1229.028125, + "completions/mean_terminated_length": 1229.028125, + "completions/min_length": 971.6, + "completions/min_terminated_length": 971.6, + "entropy": 0.2637974351644516, + "epoch": 4.00705052878966, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6931338310241699, + "learning_rate": 1.0249575963169373e-07, + "loss": 0.0016, + "num_tokens": 466462977.0, + "reward": 0.8789583444595337, + "reward_std": 0.09424636662006378, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8789583444595337, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1916855752468109, + "sampling/importance_sampling_ratio/max": 1.9907166004180907, + "sampling/importance_sampling_ratio/mean": 1.000013256072998, + "sampling/importance_sampling_ratio/min": 0.25232034027576444, + "sampling/sampling_logp_difference/max": 1.4631556510925292, + "sampling/sampling_logp_difference/mean": 0.013366755843162537, + "step": 3410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1727.6, + "completions/max_terminated_length": 1727.6, + "completions/mean_length": 1218.85625, + "completions/mean_terminated_length": 1218.85625, + "completions/min_length": 871.2, + "completions/min_terminated_length": 871.2, + "entropy": 0.2636944532394409, + "epoch": 4.012925969447709, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.49683284759521484, + "learning_rate": 1.0188999273079718e-07, + "loss": 0.0018, + "num_tokens": 467176147.0, + "reward": 0.864062511920929, + "reward_std": 0.07282592691481113, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8640625238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25493159517645836, + "sampling/importance_sampling_ratio/max": 1.9635956287384033, + "sampling/importance_sampling_ratio/mean": 0.9999071478843689, + "sampling/importance_sampling_ratio/min": 0.31713399589061736, + "sampling/sampling_logp_difference/max": 1.2184108018875122, + "sampling/sampling_logp_difference/mean": 0.01336588580161333, + "step": 3415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1758.2, + "completions/max_terminated_length": 1758.2, + "completions/mean_length": 1226.559375, + "completions/mean_terminated_length": 1226.559375, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "entropy": 0.25220133662223815, + "epoch": 4.018801410105758, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 1.0128422582990065e-07, + "loss": -0.0008, + "num_tokens": 467866534.0, + "reward": 0.9083333373069763, + "reward_std": 0.05359421372413635, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9083333373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1494651608169079, + "sampling/importance_sampling_ratio/max": 1.974375057220459, + "sampling/importance_sampling_ratio/mean": 1.0000734686851502, + "sampling/importance_sampling_ratio/min": 0.34163759648799896, + "sampling/sampling_logp_difference/max": 1.3871023654937744, + "sampling/sampling_logp_difference/mean": 0.012934430874884129, + "step": 3420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.4, + "completions/max_terminated_length": 1483.4, + "completions/mean_length": 1160.09375, + "completions/mean_terminated_length": 1160.09375, + "completions/min_length": 892.4, + "completions/min_terminated_length": 892.4, + "entropy": 0.2803857684135437, + "epoch": 4.024676850763807, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6446830630302429, + "learning_rate": 1.0067845892900411e-07, + "loss": -0.0032, + "num_tokens": 468585380.0, + "reward": 0.8739583492279053, + "reward_std": 0.07574607878923416, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8739583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1989564597606659, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999167919158936, + "sampling/importance_sampling_ratio/min": 0.32423160076141355, + "sampling/sampling_logp_difference/max": 1.2724119901657105, + "sampling/sampling_logp_difference/mean": 0.01405625492334366, + "step": 3425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1626.0, + "completions/max_terminated_length": 1626.0, + "completions/mean_length": 1228.2875, + "completions/mean_terminated_length": 1228.2875, + "completions/min_length": 988.2, + "completions/min_terminated_length": 988.2, + "entropy": 0.265902704000473, + "epoch": 4.030552291421857, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6040170788764954, + "learning_rate": 1.0007269202810759e-07, + "loss": -0.0004, + "num_tokens": 469277968.0, + "reward": 0.8377604365348816, + "reward_std": 0.08370122164487839, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8377604365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2523147314786911, + "sampling/importance_sampling_ratio/max": 1.9924169778823853, + "sampling/importance_sampling_ratio/mean": 0.9999254226684571, + "sampling/importance_sampling_ratio/min": 0.3520743578672409, + "sampling/sampling_logp_difference/max": 1.206534743309021, + "sampling/sampling_logp_difference/mean": 0.013386547565460205, + "step": 3430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1853.8, + "completions/max_terminated_length": 1853.8, + "completions/mean_length": 1253.959375, + "completions/mean_terminated_length": 1253.959375, + "completions/min_length": 908.4, + "completions/min_terminated_length": 908.4, + "entropy": 0.26015831232070924, + "epoch": 4.036427732079906, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4293096363544464, + "learning_rate": 9.946692512721104e-08, + "loss": -0.0044, + "num_tokens": 470030387.0, + "reward": 0.73828125, + "reward_std": 0.06746623069047927, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.73828125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3138516306877136, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999528050422668, + "sampling/importance_sampling_ratio/min": 0.292254401743412, + "sampling/sampling_logp_difference/max": 1.4798493027687072, + "sampling/sampling_logp_difference/mean": 0.013409636914730072, + "step": 3435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1659.4, + "completions/max_terminated_length": 1593.2, + "completions/mean_length": 1152.0875, + "completions/mean_terminated_length": 1144.1885986328125, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "entropy": 0.2462655335664749, + "epoch": 4.042303172737955, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.9179558157920837, + "learning_rate": 9.88611582263145e-08, + "loss": -0.0058, + "num_tokens": 470680119.0, + "reward": 0.9131770968437195, + "reward_std": 0.06535822451114655, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9131770968437195, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18944206982851028, + "sampling/importance_sampling_ratio/max": 1.987571358680725, + "sampling/importance_sampling_ratio/mean": 0.9999889135360718, + "sampling/importance_sampling_ratio/min": 0.42287402153015136, + "sampling/sampling_logp_difference/max": 0.9650676608085632, + "sampling/sampling_logp_difference/mean": 0.01276344656944275, + "step": 3440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1661.4, + "completions/max_terminated_length": 1661.4, + "completions/mean_length": 1190.309375, + "completions/mean_terminated_length": 1190.309375, + "completions/min_length": 872.8, + "completions/min_terminated_length": 872.8, + "entropy": 0.27079584300518034, + "epoch": 4.048178613396004, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 9.825539132541798e-08, + "loss": 0.0053, + "num_tokens": 471372362.0, + "reward": 0.8203125119209289, + "reward_std": 0.03124999850988388, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8203125119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2445871613919735, + "sampling/importance_sampling_ratio/max": 1.9853496074676513, + "sampling/importance_sampling_ratio/mean": 0.9999292612075805, + "sampling/importance_sampling_ratio/min": 0.3703574028797448, + "sampling/sampling_logp_difference/max": 1.7025861740112305, + "sampling/sampling_logp_difference/mean": 0.01366796400398016, + "step": 3445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1731.2, + "completions/max_terminated_length": 1731.2, + "completions/mean_length": 1228.340625, + "completions/mean_terminated_length": 1228.340625, + "completions/min_length": 884.4, + "completions/min_terminated_length": 884.4, + "entropy": 0.2743656039237976, + "epoch": 4.054054054054054, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6055347323417664, + "learning_rate": 9.764962442452144e-08, + "loss": 0.003, + "num_tokens": 472078599.0, + "reward": 0.8036458492279053, + "reward_std": 0.08734508380293846, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8036458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26783247590065, + "sampling/importance_sampling_ratio/max": 1.965476965904236, + "sampling/importance_sampling_ratio/mean": 1.0000924825668336, + "sampling/importance_sampling_ratio/min": 0.3593331933021545, + "sampling/sampling_logp_difference/max": 1.0880284786224366, + "sampling/sampling_logp_difference/mean": 0.013652561791241169, + "step": 3450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1785.0, + "completions/max_terminated_length": 1785.0, + "completions/mean_length": 1229.25, + "completions/mean_terminated_length": 1229.25, + "completions/min_length": 882.8, + "completions/min_terminated_length": 882.8, + "entropy": 0.27147723734378815, + "epoch": 4.059929494712104, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.573058009147644, + "learning_rate": 9.70438575236249e-08, + "loss": 0.0021, + "num_tokens": 472810231.0, + "reward": 0.9270833373069763, + "reward_std": 0.05796501636505127, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9270833373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17348659336566924, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001057624816894, + "sampling/importance_sampling_ratio/min": 0.3474849671125412, + "sampling/sampling_logp_difference/max": 1.1374139070510865, + "sampling/sampling_logp_difference/mean": 0.013674916699528694, + "step": 3455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1920.2, + "completions/max_terminated_length": 1883.6, + "completions/mean_length": 1295.409375, + "completions/mean_terminated_length": 1291.970556640625, + "completions/min_length": 890.2, + "completions/min_terminated_length": 890.2, + "entropy": 0.26137855648994446, + "epoch": 4.065804935370153, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5707703232765198, + "learning_rate": 9.643809062272837e-08, + "loss": -0.0021, + "num_tokens": 473540566.0, + "reward": 0.8479166865348816, + "reward_std": 0.056248662620782854, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8479166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25568803399801254, + "sampling/importance_sampling_ratio/max": 1.9878350973129273, + "sampling/importance_sampling_ratio/mean": 0.9999929904937744, + "sampling/importance_sampling_ratio/min": 0.35799447596073153, + "sampling/sampling_logp_difference/max": 1.1041216850280762, + "sampling/sampling_logp_difference/mean": 0.013470960408449173, + "step": 3460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1668.8, + "completions/max_terminated_length": 1668.8, + "completions/mean_length": 1234.76875, + "completions/mean_terminated_length": 1234.76875, + "completions/min_length": 984.6, + "completions/min_terminated_length": 984.6, + "entropy": 0.27419663667678834, + "epoch": 4.071680376028202, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6924020648002625, + "learning_rate": 9.583232372183183e-08, + "loss": 0.0031, + "num_tokens": 474250844.0, + "reward": 0.9218750119209289, + "reward_std": 0.07348827123641968, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9218750119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1793515682220459, + "sampling/importance_sampling_ratio/max": 1.9537212610244752, + "sampling/importance_sampling_ratio/mean": 0.9998884320259094, + "sampling/importance_sampling_ratio/min": 0.316797736287117, + "sampling/sampling_logp_difference/max": 1.2259002208709717, + "sampling/sampling_logp_difference/mean": 0.013919955492019654, + "step": 3465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1587.4, + "completions/max_terminated_length": 1587.4, + "completions/mean_length": 1218.278125, + "completions/mean_terminated_length": 1218.278125, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.2593968540430069, + "epoch": 4.077555816686251, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.521091103553772, + "learning_rate": 9.522655682093531e-08, + "loss": -0.0016, + "num_tokens": 474963797.0, + "reward": 0.8380208492279053, + "reward_std": 0.08670372664928436, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8380208492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2724744647741318, + "sampling/importance_sampling_ratio/max": 1.9798192977905273, + "sampling/importance_sampling_ratio/mean": 1.000128412246704, + "sampling/importance_sampling_ratio/min": 0.38998249769210813, + "sampling/sampling_logp_difference/max": 0.9741350650787354, + "sampling/sampling_logp_difference/mean": 0.013235159032046796, + "step": 3470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00625, + "completions/max_length": 1724.2, + "completions/max_terminated_length": 1707.4, + "completions/mean_length": 1228.634375, + "completions/mean_terminated_length": 1221.1024169921875, + "completions/min_length": 919.8, + "completions/min_terminated_length": 919.8, + "entropy": 0.2747196197509766, + "epoch": 4.083431257344301, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4292539060115814, + "learning_rate": 9.462078992003876e-08, + "loss": -0.0207, + "num_tokens": 475676008.0, + "reward": 0.83125, + "reward_std": 0.06916316822171212, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.831250011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2779860496520996, + "sampling/importance_sampling_ratio/max": 1.9322911977767945, + "sampling/importance_sampling_ratio/mean": 1.0001016497611999, + "sampling/importance_sampling_ratio/min": 0.3249115705490112, + "sampling/sampling_logp_difference/max": 1.2032855987548827, + "sampling/sampling_logp_difference/mean": 0.013829777017235756, + "step": 3475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1812.8, + "completions/max_terminated_length": 1812.8, + "completions/mean_length": 1300.915625, + "completions/mean_terminated_length": 1300.915625, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.2737528860569, + "epoch": 4.08930669800235, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.8069294095039368, + "learning_rate": 9.401502301914223e-08, + "loss": 0.0012, + "num_tokens": 476414573.0, + "reward": 0.7744791626930236, + "reward_std": 0.10844443291425705, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7744791746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3621998965740204, + "sampling/importance_sampling_ratio/max": 1.9878645896911622, + "sampling/importance_sampling_ratio/mean": 1.0000685095787047, + "sampling/importance_sampling_ratio/min": 0.3133451998233795, + "sampling/sampling_logp_difference/max": 1.2502954721450805, + "sampling/sampling_logp_difference/mean": 0.013823360577225685, + "step": 3480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.4, + "completions/max_terminated_length": 1492.4, + "completions/mean_length": 1175.41875, + "completions/mean_terminated_length": 1175.41875, + "completions/min_length": 953.0, + "completions/min_terminated_length": 953.0, + "entropy": 0.2621743202209473, + "epoch": 4.0951821386604, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 9.34092561182457e-08, + "loss": -0.0049, + "num_tokens": 477100547.0, + "reward": 0.83203125, + "reward_std": 0.034580792486667636, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.83203125, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2857617437839508, + "sampling/importance_sampling_ratio/max": 1.9388158798217774, + "sampling/importance_sampling_ratio/mean": 0.9999837040901184, + "sampling/importance_sampling_ratio/min": 0.41814273595809937, + "sampling/sampling_logp_difference/max": 1.050565242767334, + "sampling/sampling_logp_difference/mean": 0.013383341580629348, + "step": 3485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009375, + "completions/max_length": 1808.8, + "completions/max_terminated_length": 1793.2, + "completions/mean_length": 1210.359375, + "completions/mean_terminated_length": 1200.5919921875, + "completions/min_length": 831.4, + "completions/min_terminated_length": 831.4, + "entropy": 0.25503125190734866, + "epoch": 4.101057579318449, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0, + "learning_rate": 9.280348921734917e-08, + "loss": -0.0018, + "num_tokens": 477827738.0, + "reward": 0.7958333492279053, + "reward_std": 0.0934183917939663, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7958333492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26625967025756836, + "sampling/importance_sampling_ratio/max": 1.9422680139541626, + "sampling/importance_sampling_ratio/mean": 1.0000776171684265, + "sampling/importance_sampling_ratio/min": 0.2969828426837921, + "sampling/sampling_logp_difference/max": 1.300187087059021, + "sampling/sampling_logp_difference/mean": 0.013157267309725284, + "step": 3490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1592.0, + "completions/max_terminated_length": 1592.0, + "completions/mean_length": 1163.596875, + "completions/mean_terminated_length": 1163.596875, + "completions/min_length": 825.2, + "completions/min_terminated_length": 825.2, + "entropy": 0.25895902812480925, + "epoch": 4.106933019976498, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5002437233924866, + "learning_rate": 9.219772231645262e-08, + "loss": -0.001, + "num_tokens": 478509497.0, + "reward": 0.7967187523841858, + "reward_std": 0.06796298734843731, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7967187523841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2967051357030869, + "sampling/importance_sampling_ratio/max": 1.9166396617889405, + "sampling/importance_sampling_ratio/mean": 0.9999278783798218, + "sampling/importance_sampling_ratio/min": 0.33549955785274505, + "sampling/sampling_logp_difference/max": 1.154195499420166, + "sampling/sampling_logp_difference/mean": 0.013070161268115044, + "step": 3495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.2, + "completions/max_terminated_length": 1446.2, + "completions/mean_length": 1097.896875, + "completions/mean_terminated_length": 1097.896875, + "completions/min_length": 811.6, + "completions/min_terminated_length": 811.6, + "entropy": 0.26062892377376556, + "epoch": 4.112808460634548, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6695868372917175, + "learning_rate": 9.159195541555608e-08, + "loss": 0.003, + "num_tokens": 479184440.0, + "reward": 0.79411461353302, + "reward_std": 0.06309830695390702, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.79411461353302, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2844283878803253, + "sampling/importance_sampling_ratio/max": 1.9598466873168945, + "sampling/importance_sampling_ratio/mean": 1.0000836849212646, + "sampling/importance_sampling_ratio/min": 0.33525398969650266, + "sampling/sampling_logp_difference/max": 1.1045416712760925, + "sampling/sampling_logp_difference/mean": 0.013297425210475921, + "step": 3500 + }, + { + "epoch": 4.112808460634548, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1565.68, + "eval_completions/max_terminated_length": 1565.68, + "eval_completions/mean_length": 1154.865, + "eval_completions/mean_terminated_length": 1154.865, + "eval_completions/min_length": 871.24, + "eval_completions/min_terminated_length": 871.24, + "eval_entropy": 0.2613798928260803, + "eval_frac_reward_zero_std": 0.61, + "eval_loss": 0.0025717311073094606, + "eval_num_tokens": 479184440.0, + "eval_reward": 0.7733750081062317, + "eval_reward_std": 0.07942094504833222, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7733750081062317, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2978187209367752, + "eval_runtime": 429.2546, + "eval_samples_per_second": 0.233, + "eval_sampling/importance_sampling_ratio/max": 1.929530372619629, + "eval_sampling/importance_sampling_ratio/mean": 0.999969162940979, + "eval_sampling/importance_sampling_ratio/min": 0.3072571662068367, + "eval_sampling/sampling_logp_difference/max": 1.3240701341629029, + "eval_sampling/sampling_logp_difference/mean": 0.013397705145180225, + "eval_steps_per_second": 0.005, + "step": 3500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 1148.940625, + "completions/mean_terminated_length": 1148.940625, + "completions/min_length": 830.6, + "completions/min_terminated_length": 830.6, + "entropy": 0.25160637497901917, + "epoch": 4.118683901292597, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6452022790908813, + "learning_rate": 9.098618851465956e-08, + "loss": -0.0045, + "num_tokens": 479876965.0, + "reward": 0.8864583492279052, + "reward_std": 0.07887421548366547, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8864583492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19640893638134002, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000006639957428, + "sampling/importance_sampling_ratio/min": 0.3257301330566406, + "sampling/sampling_logp_difference/max": 1.2440511465072632, + "sampling/sampling_logp_difference/mean": 0.013004663959145546, + "step": 3505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.8, + "completions/max_terminated_length": 1498.8, + "completions/mean_length": 1141.73125, + "completions/mean_terminated_length": 1141.73125, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "entropy": 0.26218015551567075, + "epoch": 4.124559341950646, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.45590612292289734, + "learning_rate": 9.038042161376302e-08, + "loss": 0.0045, + "num_tokens": 480554591.0, + "reward": 0.8541666746139527, + "reward_std": 0.077769835293293, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8541666746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28215756416320803, + "sampling/importance_sampling_ratio/max": 1.9558951377868652, + "sampling/importance_sampling_ratio/mean": 1.0000158309936524, + "sampling/importance_sampling_ratio/min": 0.33649215698242185, + "sampling/sampling_logp_difference/max": 1.1709180116653441, + "sampling/sampling_logp_difference/mean": 0.013481209240853786, + "step": 3510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1506.4, + "completions/max_terminated_length": 1506.4, + "completions/mean_length": 1168.775, + "completions/mean_terminated_length": 1168.775, + "completions/min_length": 920.2, + "completions/min_terminated_length": 920.2, + "entropy": 0.26049660742282865, + "epoch": 4.130434782608695, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6427649259567261, + "learning_rate": 8.977465471286649e-08, + "loss": -0.0023, + "num_tokens": 481269495.0, + "reward": 0.686718761920929, + "reward_std": 0.06895458400249481, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.686718761920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3957706391811371, + "sampling/importance_sampling_ratio/max": 1.9636125326156617, + "sampling/importance_sampling_ratio/mean": 1.0000577330589295, + "sampling/importance_sampling_ratio/min": 0.28597378432750703, + "sampling/sampling_logp_difference/max": 1.266392183303833, + "sampling/sampling_logp_difference/mean": 0.013294227421283722, + "step": 3515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1519.8, + "completions/max_terminated_length": 1519.8, + "completions/mean_length": 1163.378125, + "completions/mean_terminated_length": 1163.378125, + "completions/min_length": 854.4, + "completions/min_terminated_length": 854.4, + "entropy": 0.24826107621192933, + "epoch": 4.136310223266745, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4001827836036682, + "learning_rate": 8.916888781196995e-08, + "loss": -0.0018, + "num_tokens": 481950032.0, + "reward": 0.8513020873069763, + "reward_std": 0.0650397665798664, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8513020873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19886459708213805, + "sampling/importance_sampling_ratio/max": 1.950411081314087, + "sampling/importance_sampling_ratio/mean": 1.0000142931938172, + "sampling/importance_sampling_ratio/min": 0.2998376667499542, + "sampling/sampling_logp_difference/max": 1.3086572647094727, + "sampling/sampling_logp_difference/mean": 0.012804117053747177, + "step": 3520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1745.4, + "completions/max_terminated_length": 1745.4, + "completions/mean_length": 1194.35625, + "completions/mean_terminated_length": 1194.35625, + "completions/min_length": 909.2, + "completions/min_terminated_length": 909.2, + "entropy": 0.2595696121454239, + "epoch": 4.142185663924795, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.41437503695487976, + "learning_rate": 8.856312091107341e-08, + "loss": -0.0007, + "num_tokens": 482659890.0, + "reward": 0.8723958492279053, + "reward_std": 0.04596400782465935, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8723958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2494454562664032, + "sampling/importance_sampling_ratio/max": 1.95485520362854, + "sampling/importance_sampling_ratio/mean": 0.999992847442627, + "sampling/importance_sampling_ratio/min": 0.40674508810043336, + "sampling/sampling_logp_difference/max": 0.9346871614456177, + "sampling/sampling_logp_difference/mean": 0.013200496323406696, + "step": 3525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 1188.909375, + "completions/mean_terminated_length": 1188.909375, + "completions/min_length": 874.4, + "completions/min_terminated_length": 874.4, + "entropy": 0.2728053092956543, + "epoch": 4.148061104582844, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7083518505096436, + "learning_rate": 8.795735401017689e-08, + "loss": 0.0038, + "num_tokens": 483374773.0, + "reward": 0.8651041865348816, + "reward_std": 0.07273668944835662, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8651041865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17529991567134856, + "sampling/importance_sampling_ratio/max": 1.9552690982818604, + "sampling/importance_sampling_ratio/mean": 0.9999983310699463, + "sampling/importance_sampling_ratio/min": 0.3792727530002594, + "sampling/sampling_logp_difference/max": 0.9897878289222717, + "sampling/sampling_logp_difference/mean": 0.013795130141079425, + "step": 3530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1658.8, + "completions/max_terminated_length": 1658.8, + "completions/mean_length": 1235.453125, + "completions/mean_terminated_length": 1235.453125, + "completions/min_length": 992.8, + "completions/min_terminated_length": 992.8, + "entropy": 0.27267765402793886, + "epoch": 4.153936545240893, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.3547857999801636, + "learning_rate": 8.735158710928034e-08, + "loss": -0.0013, + "num_tokens": 484084614.0, + "reward": 0.8630208611488343, + "reward_std": 0.05130138620734215, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8630208611488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20890597105026246, + "sampling/importance_sampling_ratio/max": 1.8556790113449098, + "sampling/importance_sampling_ratio/mean": 0.9998457551002502, + "sampling/importance_sampling_ratio/min": 0.39961166977882384, + "sampling/sampling_logp_difference/max": 0.9650723934173584, + "sampling/sampling_logp_difference/mean": 0.013754782639443875, + "step": 3535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1705.2, + "completions/max_terminated_length": 1705.2, + "completions/mean_length": 1189.584375, + "completions/mean_terminated_length": 1189.584375, + "completions/min_length": 834.8, + "completions/min_terminated_length": 834.8, + "entropy": 0.2683590054512024, + "epoch": 4.159811985898942, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.7073742151260376, + "learning_rate": 8.674582020838381e-08, + "loss": 0.009, + "num_tokens": 484785729.0, + "reward": 0.9041666746139526, + "reward_std": 0.06466917842626571, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9041666746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16571083962917327, + "sampling/importance_sampling_ratio/max": 1.9675647020339966, + "sampling/importance_sampling_ratio/mean": 1.0000088930130004, + "sampling/importance_sampling_ratio/min": 0.28218771507963536, + "sampling/sampling_logp_difference/max": 2.2028767108917235, + "sampling/sampling_logp_difference/mean": 0.013767124712467193, + "step": 3540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1502.6, + "completions/max_terminated_length": 1502.6, + "completions/mean_length": 1172.46875, + "completions/mean_terminated_length": 1172.46875, + "completions/min_length": 920.4, + "completions/min_terminated_length": 920.4, + "entropy": 0.24803606569766998, + "epoch": 4.165687426556992, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 8.614005330748728e-08, + "loss": -0.0028, + "num_tokens": 485459031.0, + "reward": 0.9213541865348815, + "reward_std": 0.07192991301417351, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9213541865348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.12924561873078347, + "sampling/importance_sampling_ratio/max": 1.9395395040512085, + "sampling/importance_sampling_ratio/mean": 0.9998730182647705, + "sampling/importance_sampling_ratio/min": 0.41810473799705505, + "sampling/sampling_logp_difference/max": 1.0929707050323487, + "sampling/sampling_logp_difference/mean": 0.012722009792923927, + "step": 3545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1635.4, + "completions/max_terminated_length": 1635.4, + "completions/mean_length": 1199.584375, + "completions/mean_terminated_length": 1199.584375, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "entropy": 0.27304658889770506, + "epoch": 4.171562867215041, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6417668461799622, + "learning_rate": 8.553428640659074e-08, + "loss": 0.0015, + "num_tokens": 486185538.0, + "reward": 0.8311979174613953, + "reward_std": 0.09750491976737977, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8311979174613953, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30009780526161195, + "sampling/importance_sampling_ratio/max": 1.9971216201782227, + "sampling/importance_sampling_ratio/mean": 0.9999606013298035, + "sampling/importance_sampling_ratio/min": 0.3398262977600098, + "sampling/sampling_logp_difference/max": 1.1382944583892822, + "sampling/sampling_logp_difference/mean": 0.014073985256254673, + "step": 3550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1537.2, + "completions/max_terminated_length": 1537.2, + "completions/mean_length": 1147.121875, + "completions/mean_terminated_length": 1147.121875, + "completions/min_length": 831.2, + "completions/min_terminated_length": 831.2, + "entropy": 0.2567810148000717, + "epoch": 4.17743830787309, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4355989694595337, + "learning_rate": 8.49285195056942e-08, + "loss": 0.0033, + "num_tokens": 486886841.0, + "reward": 0.9223958373069763, + "reward_std": 0.07777083888649941, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9223958373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16122473627328873, + "sampling/importance_sampling_ratio/max": 1.9888453960418702, + "sampling/importance_sampling_ratio/mean": 0.9999064683914185, + "sampling/importance_sampling_ratio/min": 0.31339283287525177, + "sampling/sampling_logp_difference/max": 1.2887135982513427, + "sampling/sampling_logp_difference/mean": 0.013416317850351333, + "step": 3555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1468.4, + "completions/max_terminated_length": 1468.4, + "completions/mean_length": 1161.878125, + "completions/mean_terminated_length": 1161.878125, + "completions/min_length": 911.8, + "completions/min_terminated_length": 911.8, + "entropy": 0.2630442798137665, + "epoch": 4.18331374853114, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.8106993436813354, + "learning_rate": 8.432275260479766e-08, + "loss": 0.0069, + "num_tokens": 487599474.0, + "reward": 0.8911458492279053, + "reward_std": 0.09857227653265, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8911458492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18993451595306396, + "sampling/importance_sampling_ratio/max": 1.9875691890716554, + "sampling/importance_sampling_ratio/mean": 1.0000635027885436, + "sampling/importance_sampling_ratio/min": 0.3256483495235443, + "sampling/sampling_logp_difference/max": 1.2611011028289796, + "sampling/sampling_logp_difference/mean": 0.013434172235429287, + "step": 3560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1660.2, + "completions/max_terminated_length": 1660.2, + "completions/mean_length": 1159.6125, + "completions/mean_terminated_length": 1159.6125, + "completions/min_length": 869.2, + "completions/min_terminated_length": 869.2, + "entropy": 0.257218137383461, + "epoch": 4.1891891891891895, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.4585232436656952, + "learning_rate": 8.371698570390114e-08, + "loss": 0.002, + "num_tokens": 488298678.0, + "reward": 0.8380208373069763, + "reward_std": 0.02569769471883774, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8380208373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23604546785354613, + "sampling/importance_sampling_ratio/max": 1.859304928779602, + "sampling/importance_sampling_ratio/mean": 1.000068199634552, + "sampling/importance_sampling_ratio/min": 0.3907679319381714, + "sampling/sampling_logp_difference/max": 1.0408684730529785, + "sampling/sampling_logp_difference/mean": 0.013181830570101739, + "step": 3565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1548.4, + "completions/max_terminated_length": 1548.4, + "completions/mean_length": 1177.284375, + "completions/mean_terminated_length": 1177.284375, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "entropy": 0.2626467883586884, + "epoch": 4.195064629847239, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 8.31112188030046e-08, + "loss": 0.0047, + "num_tokens": 488978465.0, + "reward": 0.8786458373069763, + "reward_std": 0.09802740439772606, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8786458373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2111774556338787, + "sampling/importance_sampling_ratio/max": 1.8543755054473876, + "sampling/importance_sampling_ratio/mean": 0.999963641166687, + "sampling/importance_sampling_ratio/min": 0.4706719875335693, + "sampling/sampling_logp_difference/max": 0.8755661010742187, + "sampling/sampling_logp_difference/mean": 0.013379019685089588, + "step": 3570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1575.2, + "completions/max_terminated_length": 1575.2, + "completions/mean_length": 1152.871875, + "completions/mean_terminated_length": 1152.871875, + "completions/min_length": 889.2, + "completions/min_terminated_length": 889.2, + "entropy": 0.26695799827575684, + "epoch": 4.200940070505288, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4237537682056427, + "learning_rate": 8.250545190210805e-08, + "loss": 0.0024, + "num_tokens": 489668328.0, + "reward": 0.8729166746139526, + "reward_std": 0.05412452816963196, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8729166746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20219584852457045, + "sampling/importance_sampling_ratio/max": 1.9795571327209474, + "sampling/importance_sampling_ratio/mean": 1.0000197649002076, + "sampling/importance_sampling_ratio/min": 0.3312128663063049, + "sampling/sampling_logp_difference/max": 1.2004522442817689, + "sampling/sampling_logp_difference/mean": 0.01357947289943695, + "step": 3575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1690.4, + "completions/max_terminated_length": 1690.4, + "completions/mean_length": 1177.5125, + "completions/mean_terminated_length": 1177.5125, + "completions/min_length": 875.4, + "completions/min_terminated_length": 875.4, + "entropy": 0.25332061350345614, + "epoch": 4.206815511163337, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6305985450744629, + "learning_rate": 8.189968500121153e-08, + "loss": 0.0012, + "num_tokens": 490368364.0, + "reward": 0.8390104293823242, + "reward_std": 0.09466763809323311, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8390104293823242, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2736502140760422, + "sampling/importance_sampling_ratio/max": 1.9991877317428588, + "sampling/importance_sampling_ratio/mean": 0.9999727964401245, + "sampling/importance_sampling_ratio/min": 0.32218090295791624, + "sampling/sampling_logp_difference/max": 1.1511583924293518, + "sampling/sampling_logp_difference/mean": 0.01315567884594202, + "step": 3580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1541.0, + "completions/max_terminated_length": 1541.0, + "completions/mean_length": 1184.74375, + "completions/mean_terminated_length": 1184.74375, + "completions/min_length": 935.2, + "completions/min_terminated_length": 935.2, + "entropy": 0.24916426241397857, + "epoch": 4.212690951821386, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.37277522683143616, + "learning_rate": 8.129391810031499e-08, + "loss": 0.0012, + "num_tokens": 491056298.0, + "reward": 0.9276041746139526, + "reward_std": 0.06645566001534461, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9276041746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15502337217330933, + "sampling/importance_sampling_ratio/max": 1.9811736583709716, + "sampling/importance_sampling_ratio/mean": 1.000077986717224, + "sampling/importance_sampling_ratio/min": 0.32719268798828127, + "sampling/sampling_logp_difference/max": 1.180593204498291, + "sampling/sampling_logp_difference/mean": 0.012911760807037353, + "step": 3585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1603.0, + "completions/max_terminated_length": 1603.0, + "completions/mean_length": 1200.834375, + "completions/mean_terminated_length": 1200.834375, + "completions/min_length": 877.4, + "completions/min_terminated_length": 877.4, + "entropy": 0.2584104537963867, + "epoch": 4.218566392479436, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5621435642242432, + "learning_rate": 8.068815119941847e-08, + "loss": 0.0027, + "num_tokens": 491756277.0, + "reward": 0.866406261920929, + "reward_std": 0.07562874779105186, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.866406261920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16670996248722075, + "sampling/importance_sampling_ratio/max": 1.9320027589797975, + "sampling/importance_sampling_ratio/mean": 1.000148606300354, + "sampling/importance_sampling_ratio/min": 0.2562232553958893, + "sampling/sampling_logp_difference/max": 1.4149065494537354, + "sampling/sampling_logp_difference/mean": 0.01321282796561718, + "step": 3590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1655.8, + "completions/max_terminated_length": 1655.8, + "completions/mean_length": 1167.93125, + "completions/mean_terminated_length": 1167.93125, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.24021802842617035, + "epoch": 4.224441833137485, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.46955856680870056, + "learning_rate": 8.008238429852192e-08, + "loss": -0.0029, + "num_tokens": 492476799.0, + "reward": 0.7401562571525574, + "reward_std": 0.06587436497211456, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7401562571525574, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2895350486040115, + "sampling/importance_sampling_ratio/max": 1.9865763902664184, + "sampling/importance_sampling_ratio/mean": 1.0000853419303894, + "sampling/importance_sampling_ratio/min": 0.3269083648920059, + "sampling/sampling_logp_difference/max": 1.1674981117248535, + "sampling/sampling_logp_difference/mean": 0.01268553752452135, + "step": 3595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1734.8, + "completions/max_terminated_length": 1734.8, + "completions/mean_length": 1220.34375, + "completions/mean_terminated_length": 1220.34375, + "completions/min_length": 886.4, + "completions/min_terminated_length": 886.4, + "entropy": 0.25528871417045595, + "epoch": 4.230317273795535, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.8392326235771179, + "learning_rate": 7.947661739762538e-08, + "loss": 0.0063, + "num_tokens": 493210509.0, + "reward": 0.7765625238418579, + "reward_std": 0.11853159815073014, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7765625238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2417447656393051, + "sampling/importance_sampling_ratio/max": 1.9319365262985229, + "sampling/importance_sampling_ratio/mean": 0.9999574422836304, + "sampling/importance_sampling_ratio/min": 0.38794071674346925, + "sampling/sampling_logp_difference/max": 1.0534749269485473, + "sampling/sampling_logp_difference/mean": 0.01314362119883299, + "step": 3600 + }, + { + "epoch": 4.230317273795535, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1550.0, + "eval_completions/max_terminated_length": 1550.0, + "eval_completions/mean_length": 1137.6325, + "eval_completions/mean_terminated_length": 1137.6325, + "eval_completions/min_length": 865.32, + "eval_completions/min_terminated_length": 865.32, + "eval_entropy": 0.25953981578350066, + "eval_frac_reward_zero_std": 0.63, + "eval_loss": 0.0015070197405293584, + "eval_num_tokens": 493210509.0, + "eval_reward": 0.7727812600135803, + "eval_reward_std": 0.07691708654165268, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7727812600135803, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29146748542785644, + "eval_runtime": 426.6542, + "eval_samples_per_second": 0.234, + "eval_sampling/importance_sampling_ratio/max": 1.9408579063415528, + "eval_sampling/importance_sampling_ratio/mean": 1.0000195336341857, + "eval_sampling/importance_sampling_ratio/min": 0.315677208006382, + "eval_sampling/sampling_logp_difference/max": 1.3441522383689881, + "eval_sampling/sampling_logp_difference/mean": 0.01335240513086319, + "eval_steps_per_second": 0.005, + "step": 3600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.4, + "completions/max_terminated_length": 1525.4, + "completions/mean_length": 1141.89375, + "completions/mean_terminated_length": 1141.89375, + "completions/min_length": 858.8, + "completions/min_terminated_length": 858.8, + "entropy": 0.2490395724773407, + "epoch": 4.236192714453584, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6798803210258484, + "learning_rate": 7.887085049672886e-08, + "loss": 0.0002, + "num_tokens": 493887211.0, + "reward": 0.8026041746139526, + "reward_std": 0.06693809628486633, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8026041746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2717654824256897, + "sampling/importance_sampling_ratio/max": 1.9659399032592773, + "sampling/importance_sampling_ratio/mean": 1.000072717666626, + "sampling/importance_sampling_ratio/min": 0.3628074645996094, + "sampling/sampling_logp_difference/max": 1.1372278690338136, + "sampling/sampling_logp_difference/mean": 0.012980341352522374, + "step": 3605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1651.6, + "completions/max_terminated_length": 1651.6, + "completions/mean_length": 1181.6375, + "completions/mean_terminated_length": 1181.6375, + "completions/min_length": 887.4, + "completions/min_terminated_length": 887.4, + "entropy": 0.250617590546608, + "epoch": 4.2420681551116335, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5288187861442566, + "learning_rate": 7.826508359583232e-08, + "loss": 0.0018, + "num_tokens": 494578743.0, + "reward": 0.8558333516120911, + "reward_std": 0.08396902829408645, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8558333516120911, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1968195393681526, + "sampling/importance_sampling_ratio/max": 1.8700719356536866, + "sampling/importance_sampling_ratio/mean": 0.999861991405487, + "sampling/importance_sampling_ratio/min": 0.3520336002111435, + "sampling/sampling_logp_difference/max": 1.1078625679016114, + "sampling/sampling_logp_difference/mean": 0.013078899681568145, + "step": 3610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1537.2, + "completions/max_terminated_length": 1537.2, + "completions/mean_length": 1147.86875, + "completions/mean_terminated_length": 1147.86875, + "completions/min_length": 904.4, + "completions/min_terminated_length": 904.4, + "entropy": 0.25358888506889343, + "epoch": 4.247943595769683, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.7324259281158447, + "learning_rate": 7.76593166949358e-08, + "loss": -0.0012, + "num_tokens": 495258845.0, + "reward": 0.8223958373069763, + "reward_std": 0.06487023383378983, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8223958373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27769546806812284, + "sampling/importance_sampling_ratio/max": 1.9683610439300536, + "sampling/importance_sampling_ratio/mean": 0.9999585747718811, + "sampling/importance_sampling_ratio/min": 0.36560204029083254, + "sampling/sampling_logp_difference/max": 1.239591932296753, + "sampling/sampling_logp_difference/mean": 0.01286852192133665, + "step": 3615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1655.6, + "completions/max_terminated_length": 1655.6, + "completions/mean_length": 1202.1, + "completions/mean_terminated_length": 1202.1, + "completions/min_length": 888.2, + "completions/min_terminated_length": 888.2, + "entropy": 0.26086442470550536, + "epoch": 4.253819036427732, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.38992178440093994, + "learning_rate": 7.705354979403925e-08, + "loss": -0.0012, + "num_tokens": 495961789.0, + "reward": 0.8336458563804626, + "reward_std": 0.06623862236738205, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8336458563804626, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25836754143238067, + "sampling/importance_sampling_ratio/max": 1.9335944414138795, + "sampling/importance_sampling_ratio/mean": 1.0000713109970092, + "sampling/importance_sampling_ratio/min": 0.3970839321613312, + "sampling/sampling_logp_difference/max": 0.9824123978614807, + "sampling/sampling_logp_difference/mean": 0.013319226913154125, + "step": 3620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1806.0, + "completions/max_terminated_length": 1806.0, + "completions/mean_length": 1218.45625, + "completions/mean_terminated_length": 1218.45625, + "completions/min_length": 847.2, + "completions/min_terminated_length": 847.2, + "entropy": 0.2762126445770264, + "epoch": 4.259694477085781, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.7492891550064087, + "learning_rate": 7.644778289314271e-08, + "loss": 0.001, + "num_tokens": 496669871.0, + "reward": 0.744531261920929, + "reward_std": 0.11009465903043747, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.744531261920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.34771095514297484, + "sampling/importance_sampling_ratio/max": 1.9903298139572143, + "sampling/importance_sampling_ratio/mean": 1.0000926017761231, + "sampling/importance_sampling_ratio/min": 0.3699575960636139, + "sampling/sampling_logp_difference/max": 1.0518778800964355, + "sampling/sampling_logp_difference/mean": 0.014002586342394352, + "step": 3625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1431.6, + "completions/max_terminated_length": 1431.6, + "completions/mean_length": 1137.309375, + "completions/mean_terminated_length": 1137.309375, + "completions/min_length": 898.8, + "completions/min_terminated_length": 898.8, + "entropy": 0.23710068166255951, + "epoch": 4.26556991774383, + "frac_reward_zero_std": 0.65, + "grad_norm": 1.1419117450714111, + "learning_rate": 7.584201599224618e-08, + "loss": 0.003, + "num_tokens": 497322610.0, + "reward": 0.9127604246139527, + "reward_std": 0.08629101514816284, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9127604246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16239862740039826, + "sampling/importance_sampling_ratio/max": 1.997536063194275, + "sampling/importance_sampling_ratio/mean": 1.0000044703483582, + "sampling/importance_sampling_ratio/min": 0.3823740020394325, + "sampling/sampling_logp_difference/max": 1.238282561302185, + "sampling/sampling_logp_difference/mean": 0.012295803800225259, + "step": 3630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1417.4, + "completions/max_terminated_length": 1417.4, + "completions/mean_length": 1097.246875, + "completions/mean_terminated_length": 1097.246875, + "completions/min_length": 780.8, + "completions/min_terminated_length": 780.8, + "entropy": 0.24801380932331085, + "epoch": 4.2714453584018806, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6945148706436157, + "learning_rate": 7.523624909134965e-08, + "loss": -0.002, + "num_tokens": 497989201.0, + "reward": 0.8451562523841858, + "reward_std": 0.07065275609493256, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8451562523841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2394854724407196, + "sampling/importance_sampling_ratio/max": 1.8666242361068726, + "sampling/importance_sampling_ratio/mean": 1.0000623703002929, + "sampling/importance_sampling_ratio/min": 0.357879763841629, + "sampling/sampling_logp_difference/max": 1.413445281982422, + "sampling/sampling_logp_difference/mean": 0.012836653739213943, + "step": 3635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1744.6, + "completions/max_terminated_length": 1741.6, + "completions/mean_length": 1198.328125, + "completions/mean_terminated_length": 1182.7527099609374, + "completions/min_length": 860.8, + "completions/min_terminated_length": 860.8, + "entropy": 0.2554967701435089, + "epoch": 4.27732079905993, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 7.463048219045311e-08, + "loss": -0.009, + "num_tokens": 498713450.0, + "reward": 0.6994791865348816, + "reward_std": 0.06559417322278023, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.6994791865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3261187255382538, + "sampling/importance_sampling_ratio/max": 1.915812611579895, + "sampling/importance_sampling_ratio/mean": 1.0000962734222412, + "sampling/importance_sampling_ratio/min": 0.37440991401672363, + "sampling/sampling_logp_difference/max": 1.04357990026474, + "sampling/sampling_logp_difference/mean": 0.013260076381266117, + "step": 3640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1707.8, + "completions/max_terminated_length": 1707.8, + "completions/mean_length": 1165.36875, + "completions/mean_terminated_length": 1165.36875, + "completions/min_length": 819.6, + "completions/min_terminated_length": 819.6, + "entropy": 0.2641367554664612, + "epoch": 4.283196239717979, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5784282684326172, + "learning_rate": 7.402471528955657e-08, + "loss": 0.004, + "num_tokens": 499375088.0, + "reward": 0.8260937690734863, + "reward_std": 0.06488855034112931, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8260937690734863, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2198262929916382, + "sampling/importance_sampling_ratio/max": 1.9456058263778686, + "sampling/importance_sampling_ratio/mean": 1.0000360012054443, + "sampling/importance_sampling_ratio/min": 0.42152239084243776, + "sampling/sampling_logp_difference/max": 0.9165781736373901, + "sampling/sampling_logp_difference/mean": 0.013488280028104782, + "step": 3645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1782.0, + "completions/max_terminated_length": 1782.0, + "completions/mean_length": 1211.425, + "completions/mean_terminated_length": 1211.425, + "completions/min_length": 864.4, + "completions/min_terminated_length": 864.4, + "entropy": 0.261778736114502, + "epoch": 4.289071680376028, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5044363141059875, + "learning_rate": 7.341894838866005e-08, + "loss": -0.0007, + "num_tokens": 500075528.0, + "reward": 0.8682291865348816, + "reward_std": 0.10688243508338928, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8682291865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25191566050052644, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001546621322632, + "sampling/importance_sampling_ratio/min": 0.3008132725954056, + "sampling/sampling_logp_difference/max": 1.3812680006027223, + "sampling/sampling_logp_difference/mean": 0.013221434317529202, + "step": 3650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1432.8, + "completions/max_terminated_length": 1432.8, + "completions/mean_length": 1110.65, + "completions/mean_terminated_length": 1110.65, + "completions/min_length": 852.6, + "completions/min_terminated_length": 852.6, + "entropy": 0.2591015428304672, + "epoch": 4.2949471210340775, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6536259055137634, + "learning_rate": 7.281318148776351e-08, + "loss": 0.0039, + "num_tokens": 500752504.0, + "reward": 0.9790624976158142, + "reward_std": 0.048888879269361495, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9790624976158142, + "rewards/e2e_recall_precision_mixed_reward/std": 0.06051587462425232, + "sampling/importance_sampling_ratio/max": 1.9475368022918702, + "sampling/importance_sampling_ratio/mean": 1.0000488758087158, + "sampling/importance_sampling_ratio/min": 0.32482802011072637, + "sampling/sampling_logp_difference/max": 1.5002553701400756, + "sampling/sampling_logp_difference/mean": 0.013194483146071434, + "step": 3655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1588.0, + "completions/max_terminated_length": 1588.0, + "completions/mean_length": 1174.31875, + "completions/mean_terminated_length": 1174.31875, + "completions/min_length": 860.4, + "completions/min_terminated_length": 860.4, + "entropy": 0.24145943224430083, + "epoch": 4.300822561692127, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.3969491720199585, + "learning_rate": 7.220741458686696e-08, + "loss": 0.0052, + "num_tokens": 501458510.0, + "reward": 0.8920312762260437, + "reward_std": 0.053078722581267355, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8920312881469726, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1960427224636078, + "sampling/importance_sampling_ratio/max": 1.921673846244812, + "sampling/importance_sampling_ratio/mean": 1.0000031232833861, + "sampling/importance_sampling_ratio/min": 0.39288265705108644, + "sampling/sampling_logp_difference/max": 1.1308459281921386, + "sampling/sampling_logp_difference/mean": 0.012670677155256271, + "step": 3660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1531.8, + "completions/max_terminated_length": 1531.8, + "completions/mean_length": 1174.265625, + "completions/mean_terminated_length": 1174.265625, + "completions/min_length": 933.4, + "completions/min_terminated_length": 933.4, + "entropy": 0.2668359637260437, + "epoch": 4.306698002350176, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.930717408657074, + "learning_rate": 7.160164768597044e-08, + "loss": 0.004, + "num_tokens": 502164675.0, + "reward": 0.8020833373069763, + "reward_std": 0.09576954618096352, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8020833373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2947723791003227, + "sampling/importance_sampling_ratio/max": 1.946074938774109, + "sampling/importance_sampling_ratio/mean": 1.000030016899109, + "sampling/importance_sampling_ratio/min": 0.38857103884220123, + "sampling/sampling_logp_difference/max": 1.1057616233825684, + "sampling/sampling_logp_difference/mean": 0.01362884696573019, + "step": 3665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1576.2, + "completions/max_terminated_length": 1576.2, + "completions/mean_length": 1180.81875, + "completions/mean_terminated_length": 1180.81875, + "completions/min_length": 872.8, + "completions/min_terminated_length": 872.8, + "entropy": 0.25552141666412354, + "epoch": 4.312573443008225, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.585945188999176, + "learning_rate": 7.09958807850739e-08, + "loss": -0.0017, + "num_tokens": 502853849.0, + "reward": 0.9364583373069764, + "reward_std": 0.055623647570610044, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9364583373069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1332593083381653, + "sampling/importance_sampling_ratio/max": 1.8885759353637694, + "sampling/importance_sampling_ratio/mean": 1.000054121017456, + "sampling/importance_sampling_ratio/min": 0.4013862669467926, + "sampling/sampling_logp_difference/max": 0.9747562885284424, + "sampling/sampling_logp_difference/mean": 0.01286784913390875, + "step": 3670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1527.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 1141.078125, + "completions/mean_terminated_length": 1141.078125, + "completions/min_length": 849.0, + "completions/min_terminated_length": 849.0, + "entropy": 0.2603608280420303, + "epoch": 4.318448883666275, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.730247437953949, + "learning_rate": 7.039011388417738e-08, + "loss": -0.0032, + "num_tokens": 503527554.0, + "reward": 0.9072916865348816, + "reward_std": 0.06838146299123764, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9072916865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20818188190460205, + "sampling/importance_sampling_ratio/max": 1.9301481246948242, + "sampling/importance_sampling_ratio/mean": 1.000066590309143, + "sampling/importance_sampling_ratio/min": 0.24810873121023178, + "sampling/sampling_logp_difference/max": 1.7007533073425294, + "sampling/sampling_logp_difference/mean": 0.013547290675342083, + "step": 3675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1607.8, + "completions/max_terminated_length": 1607.8, + "completions/mean_length": 1213.71875, + "completions/mean_terminated_length": 1213.71875, + "completions/min_length": 994.8, + "completions/min_terminated_length": 994.8, + "entropy": 0.2658453106880188, + "epoch": 4.324324324324325, + "frac_reward_zero_std": 0.9, + "grad_norm": 0.0, + "learning_rate": 6.978434698328083e-08, + "loss": -0.0005, + "num_tokens": 504220120.0, + "reward": 0.9796875, + "reward_std": 0.016568987071514128, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9796875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.05093269646167755, + "sampling/importance_sampling_ratio/max": 1.8813456058502198, + "sampling/importance_sampling_ratio/mean": 1.0000309467315673, + "sampling/importance_sampling_ratio/min": 0.4121032416820526, + "sampling/sampling_logp_difference/max": 1.1532811164855956, + "sampling/sampling_logp_difference/mean": 0.013305356167256832, + "step": 3680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1571.8, + "completions/max_terminated_length": 1571.8, + "completions/mean_length": 1158.6625, + "completions/mean_terminated_length": 1158.6625, + "completions/min_length": 868.2, + "completions/min_terminated_length": 868.2, + "entropy": 0.25527799427509307, + "epoch": 4.330199764982374, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.37966060638427734, + "learning_rate": 6.917858008238429e-08, + "loss": 0.0045, + "num_tokens": 504920364.0, + "reward": 0.7671875059604645, + "reward_std": 0.08376320600509643, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7671875059604645, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2696637690067291, + "sampling/importance_sampling_ratio/max": 1.9269562005996703, + "sampling/importance_sampling_ratio/mean": 1.0000017642974854, + "sampling/importance_sampling_ratio/min": 0.2865142642069486, + "sampling/sampling_logp_difference/max": 4.005220603942871, + "sampling/sampling_logp_difference/mean": 0.012985915131866931, + "step": 3685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1684.8, + "completions/max_terminated_length": 1684.8, + "completions/mean_length": 1224.925, + "completions/mean_terminated_length": 1224.925, + "completions/min_length": 923.4, + "completions/min_terminated_length": 923.4, + "entropy": 0.24377625286579133, + "epoch": 4.336075205640423, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.342869371175766, + "learning_rate": 6.857281318148777e-08, + "loss": -0.0065, + "num_tokens": 505618484.0, + "reward": 0.845104169845581, + "reward_std": 0.0953491523861885, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.845104169845581, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23468240201473237, + "sampling/importance_sampling_ratio/max": 1.9759195566177368, + "sampling/importance_sampling_ratio/mean": 1.0000083088874816, + "sampling/importance_sampling_ratio/min": 0.29937623292207716, + "sampling/sampling_logp_difference/max": 1.3291741847991942, + "sampling/sampling_logp_difference/mean": 0.012640192732214927, + "step": 3690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1699.6, + "completions/max_terminated_length": 1699.6, + "completions/mean_length": 1237.003125, + "completions/mean_terminated_length": 1237.003125, + "completions/min_length": 942.0, + "completions/min_terminated_length": 942.0, + "entropy": 0.26305500864982606, + "epoch": 4.341950646298472, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7630984783172607, + "learning_rate": 6.796704628059123e-08, + "loss": -0.001, + "num_tokens": 506308533.0, + "reward": 0.7562500238418579, + "reward_std": 0.10581399351358414, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7562500238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3192788898944855, + "sampling/importance_sampling_ratio/max": 1.9842085123062134, + "sampling/importance_sampling_ratio/mean": 0.9998934268951416, + "sampling/importance_sampling_ratio/min": 0.4040143370628357, + "sampling/sampling_logp_difference/max": 1.1104382038116456, + "sampling/sampling_logp_difference/mean": 0.013264241628348827, + "step": 3695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.8, + "completions/max_terminated_length": 1701.8, + "completions/mean_length": 1203.19375, + "completions/mean_terminated_length": 1203.19375, + "completions/min_length": 872.6, + "completions/min_terminated_length": 872.6, + "entropy": 0.2650981694459915, + "epoch": 4.3478260869565215, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.636228621006012, + "learning_rate": 6.736127937969469e-08, + "loss": 0.009, + "num_tokens": 507004915.0, + "reward": 0.870312511920929, + "reward_std": 0.09012310206890106, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.870312511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19685865193605423, + "sampling/importance_sampling_ratio/max": 1.9754976272583007, + "sampling/importance_sampling_ratio/mean": 0.9999840497970581, + "sampling/importance_sampling_ratio/min": 0.3514511287212372, + "sampling/sampling_logp_difference/max": 1.0524065256118775, + "sampling/sampling_logp_difference/mean": 0.013504279032349586, + "step": 3700 + }, + { + "epoch": 4.3478260869565215, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1591.76, + "eval_completions/max_terminated_length": 1591.76, + "eval_completions/mean_length": 1160.758125, + "eval_completions/mean_terminated_length": 1160.758125, + "eval_completions/min_length": 884.4, + "eval_completions/min_terminated_length": 884.4, + "eval_entropy": 0.2645591682195663, + "eval_frac_reward_zero_std": 0.6, + "eval_loss": 0.0012973687844350934, + "eval_num_tokens": 507004915.0, + "eval_reward": 0.7682916808128357, + "eval_reward_std": 0.0784290412068367, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7682916808128357, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2942594337463379, + "eval_runtime": 436.9774, + "eval_samples_per_second": 0.229, + "eval_sampling/importance_sampling_ratio/max": 1.9532128953933716, + "eval_sampling/importance_sampling_ratio/mean": 1.0000083541870117, + "eval_sampling/importance_sampling_ratio/min": 0.3924976485967636, + "eval_sampling/sampling_logp_difference/max": 1.033168478012085, + "eval_sampling/sampling_logp_difference/mean": 0.013480137176811695, + "eval_steps_per_second": 0.005, + "step": 3700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1606.2, + "completions/max_terminated_length": 1606.2, + "completions/mean_length": 1227.14375, + "completions/mean_terminated_length": 1227.14375, + "completions/min_length": 927.6, + "completions/min_terminated_length": 927.6, + "entropy": 0.26960003972053526, + "epoch": 4.353701527614571, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.45175471901893616, + "learning_rate": 6.675551247879815e-08, + "loss": -0.0005, + "num_tokens": 507726785.0, + "reward": 0.8671875119209289, + "reward_std": 0.05317651703953743, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8671875119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.232213281840086, + "sampling/importance_sampling_ratio/max": 1.9744548320770263, + "sampling/importance_sampling_ratio/mean": 0.999909496307373, + "sampling/importance_sampling_ratio/min": 0.37495740652084353, + "sampling/sampling_logp_difference/max": 1.0028222799301147, + "sampling/sampling_logp_difference/mean": 0.013552324287593365, + "step": 3705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1602.8, + "completions/max_terminated_length": 1602.8, + "completions/mean_length": 1243.771875, + "completions/mean_terminated_length": 1243.771875, + "completions/min_length": 975.8, + "completions/min_terminated_length": 975.8, + "entropy": 0.26458646953105924, + "epoch": 4.35957696827262, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.0, + "learning_rate": 6.614974557790162e-08, + "loss": -0.0037, + "num_tokens": 508442760.0, + "reward": 0.8958333492279053, + "reward_std": 0.05056643486022949, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8958333492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17827844023704528, + "sampling/importance_sampling_ratio/max": 1.9635953903198242, + "sampling/importance_sampling_ratio/mean": 1.0000174283981322, + "sampling/importance_sampling_ratio/min": 0.3145808935165405, + "sampling/sampling_logp_difference/max": 1.2495264530181884, + "sampling/sampling_logp_difference/mean": 0.013446901552379131, + "step": 3710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1497.2, + "completions/max_terminated_length": 1497.2, + "completions/mean_length": 1114.21875, + "completions/mean_terminated_length": 1114.21875, + "completions/min_length": 796.2, + "completions/min_terminated_length": 796.2, + "entropy": 0.24198752343654634, + "epoch": 4.36545240893067, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5798568725585938, + "learning_rate": 6.554397867700509e-08, + "loss": 0.0015, + "num_tokens": 509124622.0, + "reward": 0.8671875119209289, + "reward_std": 0.09523374810814858, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8671875119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2366887092590332, + "sampling/importance_sampling_ratio/max": 1.9578262090682983, + "sampling/importance_sampling_ratio/mean": 0.9999819397926331, + "sampling/importance_sampling_ratio/min": 0.35864190459251405, + "sampling/sampling_logp_difference/max": 1.0817768573760986, + "sampling/sampling_logp_difference/mean": 0.012588843144476414, + "step": 3715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1620.8, + "completions/max_terminated_length": 1620.8, + "completions/mean_length": 1219.38125, + "completions/mean_terminated_length": 1219.38125, + "completions/min_length": 903.4, + "completions/min_terminated_length": 903.4, + "entropy": 0.25731444656848906, + "epoch": 4.371327849588719, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 6.493821177610854e-08, + "loss": -0.0002, + "num_tokens": 509831912.0, + "reward": 0.8432291746139526, + "reward_std": 0.06508480310440064, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8432291746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22504698634147643, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000614643096923, + "sampling/importance_sampling_ratio/min": 0.28552534580230715, + "sampling/sampling_logp_difference/max": 1.281605863571167, + "sampling/sampling_logp_difference/mean": 0.01319657452404499, + "step": 3720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1544.8, + "completions/max_terminated_length": 1544.8, + "completions/mean_length": 1148.18125, + "completions/mean_terminated_length": 1148.18125, + "completions/min_length": 829.4, + "completions/min_terminated_length": 829.4, + "entropy": 0.2571199804544449, + "epoch": 4.377203290246769, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.48561370372772217, + "learning_rate": 6.433244487521202e-08, + "loss": -0.001, + "num_tokens": 510500562.0, + "reward": 0.9138020873069763, + "reward_std": 0.05333668440580368, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9138020873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17001340240240098, + "sampling/importance_sampling_ratio/max": 1.9414838314056397, + "sampling/importance_sampling_ratio/mean": 1.0000270128250122, + "sampling/importance_sampling_ratio/min": 0.41579994559288025, + "sampling/sampling_logp_difference/max": 0.9531768798828125, + "sampling/sampling_logp_difference/mean": 0.012977257929742336, + "step": 3725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1919.6, + "completions/max_terminated_length": 1919.6, + "completions/mean_length": 1261.775, + "completions/mean_terminated_length": 1261.775, + "completions/min_length": 897.4, + "completions/min_terminated_length": 897.4, + "entropy": 0.2818825155496597, + "epoch": 4.383078730904818, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5734153389930725, + "learning_rate": 6.372667797431548e-08, + "loss": -0.0026, + "num_tokens": 511228778.0, + "reward": 0.8471354365348815, + "reward_std": 0.0608148567378521, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8471354365348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23871059119701385, + "sampling/importance_sampling_ratio/max": 1.9525668382644654, + "sampling/importance_sampling_ratio/mean": 1.0000926733016968, + "sampling/importance_sampling_ratio/min": 0.37156358957290647, + "sampling/sampling_logp_difference/max": 1.0653681755065918, + "sampling/sampling_logp_difference/mean": 0.01405724585056305, + "step": 3730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1603.4, + "completions/max_terminated_length": 1603.4, + "completions/mean_length": 1199.4875, + "completions/mean_terminated_length": 1199.4875, + "completions/min_length": 797.8, + "completions/min_terminated_length": 797.8, + "entropy": 0.2717233240604401, + "epoch": 4.388954171562867, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.44968000054359436, + "learning_rate": 6.312091107341894e-08, + "loss": 0.0036, + "num_tokens": 511913702.0, + "reward": 0.9526041746139526, + "reward_std": 0.07319597527384758, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9526041746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13536526411771774, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999484062194824, + "sampling/importance_sampling_ratio/min": 0.24164845794462195, + "sampling/sampling_logp_difference/max": 6.622607588768005, + "sampling/sampling_logp_difference/mean": 0.01382777951657772, + "step": 3735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.0, + "completions/max_terminated_length": 1701.0, + "completions/mean_length": 1223.628125, + "completions/mean_terminated_length": 1223.628125, + "completions/min_length": 923.8, + "completions/min_terminated_length": 923.8, + "entropy": 0.25062851011753084, + "epoch": 4.394829612220916, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8033055663108826, + "learning_rate": 6.251514417252241e-08, + "loss": 0.0072, + "num_tokens": 512615423.0, + "reward": 0.8494791865348816, + "reward_std": 0.11086350381374359, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8494791865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1987324446439743, + "sampling/importance_sampling_ratio/max": 1.931475830078125, + "sampling/importance_sampling_ratio/mean": 1.0000649571418763, + "sampling/importance_sampling_ratio/min": 0.30528209507465365, + "sampling/sampling_logp_difference/max": 1.3587350845336914, + "sampling/sampling_logp_difference/mean": 0.013007241114974023, + "step": 3740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.2, + "completions/max_terminated_length": 1593.2, + "completions/mean_length": 1212.203125, + "completions/mean_terminated_length": 1212.203125, + "completions/min_length": 946.6, + "completions/min_terminated_length": 946.6, + "entropy": 0.24510794878005981, + "epoch": 4.4007050528789655, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.45203477144241333, + "learning_rate": 6.190937727162587e-08, + "loss": 0.0007, + "num_tokens": 513293504.0, + "reward": 0.9322916746139527, + "reward_std": 0.053071030974388124, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9322916746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16087620556354523, + "sampling/importance_sampling_ratio/max": 1.987834596633911, + "sampling/importance_sampling_ratio/mean": 0.9998769402503968, + "sampling/importance_sampling_ratio/min": 0.33060061633586885, + "sampling/sampling_logp_difference/max": 1.3648550748825072, + "sampling/sampling_logp_difference/mean": 0.012442305870354175, + "step": 3745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1676.0, + "completions/max_terminated_length": 1676.0, + "completions/mean_length": 1173.953125, + "completions/mean_terminated_length": 1173.953125, + "completions/min_length": 914.2, + "completions/min_terminated_length": 914.2, + "entropy": 0.2551372706890106, + "epoch": 4.406580493537016, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4871174991130829, + "learning_rate": 6.130361037072935e-08, + "loss": -0.0072, + "num_tokens": 513961489.0, + "reward": 0.8958333492279053, + "reward_std": 0.08676534816622734, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8958333492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20122278928756715, + "sampling/importance_sampling_ratio/max": 1.957563042640686, + "sampling/importance_sampling_ratio/mean": 1.0001355171203614, + "sampling/importance_sampling_ratio/min": 0.29272522777318954, + "sampling/sampling_logp_difference/max": 1.3090601205825805, + "sampling/sampling_logp_difference/mean": 0.013074627332389354, + "step": 3750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1644.0, + "completions/max_terminated_length": 1644.0, + "completions/mean_length": 1176.628125, + "completions/mean_terminated_length": 1176.628125, + "completions/min_length": 876.6, + "completions/min_terminated_length": 876.6, + "entropy": 0.259324786067009, + "epoch": 4.412455934195065, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6822929382324219, + "learning_rate": 6.069784346983281e-08, + "loss": 0.0024, + "num_tokens": 514653978.0, + "reward": 0.8907291769981385, + "reward_std": 0.07878585755825043, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8907291769981385, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19454094469547273, + "sampling/importance_sampling_ratio/max": 1.972559380531311, + "sampling/importance_sampling_ratio/mean": 1.000023365020752, + "sampling/importance_sampling_ratio/min": 0.19456153102219104, + "sampling/sampling_logp_difference/max": 2.361952781677246, + "sampling/sampling_logp_difference/mean": 0.01318011675029993, + "step": 3755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1492.8, + "completions/max_terminated_length": 1492.8, + "completions/mean_length": 1159.703125, + "completions/mean_terminated_length": 1159.703125, + "completions/min_length": 855.6, + "completions/min_terminated_length": 855.6, + "entropy": 0.2610477864742279, + "epoch": 4.418331374853114, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6041929125785828, + "learning_rate": 6.009207656893627e-08, + "loss": 0.0042, + "num_tokens": 515357595.0, + "reward": 0.7888021111488343, + "reward_std": 0.09884281009435654, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7888021111488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.31391807496547697, + "sampling/importance_sampling_ratio/max": 1.9285221576690674, + "sampling/importance_sampling_ratio/mean": 1.0000345706939697, + "sampling/importance_sampling_ratio/min": 0.4189653038978577, + "sampling/sampling_logp_difference/max": 0.9033971548080444, + "sampling/sampling_logp_difference/mean": 0.013369284197688102, + "step": 3760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1598.0, + "completions/max_terminated_length": 1598.0, + "completions/mean_length": 1193.55625, + "completions/mean_terminated_length": 1193.55625, + "completions/min_length": 891.8, + "completions/min_terminated_length": 891.8, + "entropy": 0.25302115380764006, + "epoch": 4.424206815511163, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.7945708632469177, + "learning_rate": 5.9486309668039735e-08, + "loss": 0.0011, + "num_tokens": 516055325.0, + "reward": 0.7338541746139526, + "reward_std": 0.15235694646835327, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7338541746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3311033874750137, + "sampling/importance_sampling_ratio/max": 1.8949079275131226, + "sampling/importance_sampling_ratio/mean": 0.9999711632728576, + "sampling/importance_sampling_ratio/min": 0.3348624587059021, + "sampling/sampling_logp_difference/max": 1.168335199356079, + "sampling/sampling_logp_difference/mean": 0.01294925194233656, + "step": 3765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1564.2, + "completions/max_terminated_length": 1564.2, + "completions/mean_length": 1203.6125, + "completions/mean_terminated_length": 1203.6125, + "completions/min_length": 921.6, + "completions/min_terminated_length": 921.6, + "entropy": 0.25160735845565796, + "epoch": 4.430082256169213, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4195053279399872, + "learning_rate": 5.8880542767143204e-08, + "loss": -0.0016, + "num_tokens": 516760993.0, + "reward": 0.8256250143051147, + "reward_std": 0.05763051435351372, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8256250023841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24419592916965485, + "sampling/importance_sampling_ratio/max": 1.9390623331069947, + "sampling/importance_sampling_ratio/mean": 1.0000794172286986, + "sampling/importance_sampling_ratio/min": 0.40482619404792786, + "sampling/sampling_logp_difference/max": 0.967167592048645, + "sampling/sampling_logp_difference/mean": 0.01277566682547331, + "step": 3770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1495.4, + "completions/max_terminated_length": 1495.4, + "completions/mean_length": 1146.275, + "completions/mean_terminated_length": 1146.275, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "entropy": 0.2464199036359787, + "epoch": 4.435957696827262, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4899135231971741, + "learning_rate": 5.827477586624666e-08, + "loss": 0.0013, + "num_tokens": 517446233.0, + "reward": 0.8338541746139526, + "reward_std": 0.062453911453485486, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8338541746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.252014647424221, + "sampling/importance_sampling_ratio/max": 1.9053905963897706, + "sampling/importance_sampling_ratio/mean": 1.0000820279121398, + "sampling/importance_sampling_ratio/min": 0.3842323422431946, + "sampling/sampling_logp_difference/max": 0.9629977226257325, + "sampling/sampling_logp_difference/mean": 0.01282045040279627, + "step": 3775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1525.8, + "completions/max_terminated_length": 1525.8, + "completions/mean_length": 1221.778125, + "completions/mean_terminated_length": 1221.778125, + "completions/min_length": 925.4, + "completions/min_terminated_length": 925.4, + "entropy": 0.25722981095314024, + "epoch": 4.441833137485311, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.7961036562919617, + "learning_rate": 5.766900896535013e-08, + "loss": 0.0022, + "num_tokens": 518156594.0, + "reward": 0.7723958492279053, + "reward_std": 0.09959375858306885, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7723958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22938498258590698, + "sampling/importance_sampling_ratio/max": 1.9257365465164185, + "sampling/importance_sampling_ratio/mean": 0.9999468803405762, + "sampling/importance_sampling_ratio/min": 0.3357783600687981, + "sampling/sampling_logp_difference/max": 1.324942374229431, + "sampling/sampling_logp_difference/mean": 0.012965224497020245, + "step": 3780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1477.6, + "completions/max_terminated_length": 1477.6, + "completions/mean_length": 1133.83125, + "completions/mean_terminated_length": 1133.83125, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "entropy": 0.2477620154619217, + "epoch": 4.447708578143361, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.5965957045555115, + "learning_rate": 5.706324206445359e-08, + "loss": -0.0029, + "num_tokens": 518821772.0, + "reward": 0.917187511920929, + "reward_std": 0.031526891887187956, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.917187511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13625341057777404, + "sampling/importance_sampling_ratio/max": 1.9273170948028564, + "sampling/importance_sampling_ratio/mean": 1.0000198364257813, + "sampling/importance_sampling_ratio/min": 0.31869285106658934, + "sampling/sampling_logp_difference/max": 1.2890705108642577, + "sampling/sampling_logp_difference/mean": 0.012815793789923192, + "step": 3785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1529.0, + "completions/max_terminated_length": 1529.0, + "completions/mean_length": 1149.540625, + "completions/mean_terminated_length": 1149.540625, + "completions/min_length": 882.2, + "completions/min_terminated_length": 882.2, + "entropy": 0.2463329553604126, + "epoch": 4.45358401880141, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 5.645747516355706e-08, + "loss": 0.0083, + "num_tokens": 519511353.0, + "reward": 0.9010416746139527, + "reward_std": 0.0561327800154686, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9010416746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17636755406856536, + "sampling/importance_sampling_ratio/max": 1.998112392425537, + "sampling/importance_sampling_ratio/mean": 0.9999524116516113, + "sampling/importance_sampling_ratio/min": 0.26993586095049976, + "sampling/sampling_logp_difference/max": 1.9990779876708984, + "sampling/sampling_logp_difference/mean": 0.012809686362743378, + "step": 3790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1756.6, + "completions/max_terminated_length": 1756.6, + "completions/mean_length": 1267.45625, + "completions/mean_terminated_length": 1267.45625, + "completions/min_length": 894.4, + "completions/min_terminated_length": 894.4, + "entropy": 0.26598574221134186, + "epoch": 4.45945945945946, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.4366239011287689, + "learning_rate": 5.5851708262660525e-08, + "loss": -0.004, + "num_tokens": 520282155.0, + "reward": 0.7437500119209289, + "reward_std": 0.06342579536139965, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7437500119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.29738129377365113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.000070607662201, + "sampling/importance_sampling_ratio/min": 0.20193365388549864, + "sampling/sampling_logp_difference/max": 2.41158607006073, + "sampling/sampling_logp_difference/mean": 0.013761808723211288, + "step": 3795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1707.8, + "completions/max_terminated_length": 1707.8, + "completions/mean_length": 1245.2125, + "completions/mean_terminated_length": 1245.2125, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "entropy": 0.2679412066936493, + "epoch": 4.465334900117509, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6861710548400879, + "learning_rate": 5.524594136176399e-08, + "loss": 0.002, + "num_tokens": 521014463.0, + "reward": 0.8529687523841858, + "reward_std": 0.0945423498749733, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8529687523841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21668245792388915, + "sampling/importance_sampling_ratio/max": 1.9160140991210937, + "sampling/importance_sampling_ratio/mean": 1.0001230716705323, + "sampling/importance_sampling_ratio/min": 0.3523849457502365, + "sampling/sampling_logp_difference/max": 1.2402945041656495, + "sampling/sampling_logp_difference/mean": 0.013412072695791722, + "step": 3800 + }, + { + "epoch": 4.465334900117509, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1611.96, + "eval_completions/max_terminated_length": 1611.96, + "eval_completions/mean_length": 1146.22375, + "eval_completions/mean_terminated_length": 1146.22375, + "eval_completions/min_length": 863.76, + "eval_completions/min_terminated_length": 863.76, + "eval_entropy": 0.262061927318573, + "eval_frac_reward_zero_std": 0.63, + "eval_loss": 0.0004021058266516775, + "eval_num_tokens": 521014463.0, + "eval_reward": 0.77735417842865, + "eval_reward_std": 0.06804241336882115, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.77735417842865, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2873810449242592, + "eval_runtime": 437.7798, + "eval_samples_per_second": 0.228, + "eval_sampling/importance_sampling_ratio/max": 1.9648003053665162, + "eval_sampling/importance_sampling_ratio/mean": 0.9999938631057739, + "eval_sampling/importance_sampling_ratio/min": 0.321398678869009, + "eval_sampling/sampling_logp_difference/max": 1.3109819889068604, + "eval_sampling/sampling_logp_difference/mean": 0.013326054327189923, + "eval_steps_per_second": 0.005, + "step": 3800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1800.6, + "completions/max_terminated_length": 1800.6, + "completions/mean_length": 1244.31875, + "completions/mean_terminated_length": 1244.31875, + "completions/min_length": 900.0, + "completions/min_terminated_length": 900.0, + "entropy": 0.2523723661899567, + "epoch": 4.471210340775558, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.6707119345664978, + "learning_rate": 5.464017446086745e-08, + "loss": 0.0054, + "num_tokens": 521734005.0, + "reward": 0.8927083492279053, + "reward_std": 0.10553528666496277, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8927083492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19338307827711104, + "sampling/importance_sampling_ratio/max": 1.923611617088318, + "sampling/importance_sampling_ratio/mean": 1.000162625312805, + "sampling/importance_sampling_ratio/min": 0.27145159617066383, + "sampling/sampling_logp_difference/max": 1.490595269203186, + "sampling/sampling_logp_difference/mean": 0.012991057336330413, + "step": 3805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1444.0, + "completions/max_terminated_length": 1444.0, + "completions/mean_length": 1088.00625, + "completions/mean_terminated_length": 1088.00625, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "entropy": 0.24044516384601594, + "epoch": 4.477085781433607, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.8339589834213257, + "learning_rate": 5.403440755997092e-08, + "loss": 0.0051, + "num_tokens": 522375063.0, + "reward": 0.8260416746139526, + "reward_std": 0.06378234401345254, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8260416746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2462353974580765, + "sampling/importance_sampling_ratio/max": 1.935638427734375, + "sampling/importance_sampling_ratio/mean": 0.9999543190002441, + "sampling/importance_sampling_ratio/min": 0.3494003385305405, + "sampling/sampling_logp_difference/max": 1.260498571395874, + "sampling/sampling_logp_difference/mean": 0.012606218457221985, + "step": 3810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1463.0, + "completions/max_terminated_length": 1463.0, + "completions/mean_length": 1126.371875, + "completions/mean_terminated_length": 1126.371875, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "entropy": 0.25203177332878113, + "epoch": 4.482961222091657, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6170172691345215, + "learning_rate": 5.3428640659074383e-08, + "loss": 0.0062, + "num_tokens": 523071006.0, + "reward": 0.7230208396911622, + "reward_std": 0.055364441871643064, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7230208396911622, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3580518037080765, + "sampling/importance_sampling_ratio/max": 1.866661834716797, + "sampling/importance_sampling_ratio/mean": 1.0000461101531983, + "sampling/importance_sampling_ratio/min": 0.3747582271695137, + "sampling/sampling_logp_difference/max": 1.2039991855621337, + "sampling/sampling_logp_difference/mean": 0.01318561527878046, + "step": 3815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1722.6, + "completions/max_terminated_length": 1710.8, + "completions/mean_length": 1216.221875, + "completions/mean_terminated_length": 1203.265234375, + "completions/min_length": 849.6, + "completions/min_terminated_length": 849.6, + "entropy": 0.25681395530700685, + "epoch": 4.488836662749706, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 5.282287375817785e-08, + "loss": -0.0054, + "num_tokens": 523788389.0, + "reward": 0.8151041746139527, + "reward_std": 0.055162250995635986, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8151041746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26677214801311494, + "sampling/importance_sampling_ratio/max": 1.8846506595611572, + "sampling/importance_sampling_ratio/mean": 0.9999499320983887, + "sampling/importance_sampling_ratio/min": 0.24901417940855025, + "sampling/sampling_logp_difference/max": 1.526833724975586, + "sampling/sampling_logp_difference/mean": 0.013307036273181439, + "step": 3820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1553.6, + "completions/max_terminated_length": 1553.6, + "completions/mean_length": 1177.953125, + "completions/mean_terminated_length": 1177.953125, + "completions/min_length": 902.4, + "completions/min_terminated_length": 902.4, + "entropy": 0.2598425537347794, + "epoch": 4.494712103407756, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.0, + "learning_rate": 5.221710685728132e-08, + "loss": -0.0017, + "num_tokens": 524476582.0, + "reward": 0.8104166746139526, + "reward_std": 0.03926374763250351, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8104166746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24043528139591216, + "sampling/importance_sampling_ratio/max": 1.9260545253753663, + "sampling/importance_sampling_ratio/mean": 1.0000295519828797, + "sampling/importance_sampling_ratio/min": 0.31765392124652864, + "sampling/sampling_logp_difference/max": 1.1738762140274048, + "sampling/sampling_logp_difference/mean": 0.012885869853198529, + "step": 3825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1537.2, + "completions/max_terminated_length": 1537.2, + "completions/mean_length": 1190.83125, + "completions/mean_terminated_length": 1190.83125, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "entropy": 0.2721462845802307, + "epoch": 4.500587544065805, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.0, + "learning_rate": 5.161133995638478e-08, + "loss": -0.0019, + "num_tokens": 525207200.0, + "reward": 0.8614583492279053, + "reward_std": 0.06717992275953293, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8614583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19824496507644654, + "sampling/importance_sampling_ratio/max": 1.9055607557296752, + "sampling/importance_sampling_ratio/mean": 0.9999924659729004, + "sampling/importance_sampling_ratio/min": 0.30736014246940613, + "sampling/sampling_logp_difference/max": 1.254793071746826, + "sampling/sampling_logp_difference/mean": 0.013739836029708385, + "step": 3830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1508.6, + "completions/max_terminated_length": 1508.6, + "completions/mean_length": 1160.54375, + "completions/mean_terminated_length": 1160.54375, + "completions/min_length": 906.4, + "completions/min_terminated_length": 906.4, + "entropy": 0.24823629558086396, + "epoch": 4.506462984723854, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.0, + "learning_rate": 5.100557305548825e-08, + "loss": -0.0014, + "num_tokens": 525878430.0, + "reward": 0.8552083373069763, + "reward_std": 0.04184042811393738, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8552083373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1641036868095398, + "sampling/importance_sampling_ratio/max": 1.9360382080078125, + "sampling/importance_sampling_ratio/mean": 1.0001091837882996, + "sampling/importance_sampling_ratio/min": 0.435736221075058, + "sampling/sampling_logp_difference/max": 0.8629006385803223, + "sampling/sampling_logp_difference/mean": 0.01278580967336893, + "step": 3835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1659.2, + "completions/max_terminated_length": 1659.2, + "completions/mean_length": 1220.75, + "completions/mean_terminated_length": 1220.75, + "completions/min_length": 908.2, + "completions/min_terminated_length": 908.2, + "entropy": 0.26801995635032655, + "epoch": 4.512338425381904, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 5.039980615459171e-08, + "loss": 0.0028, + "num_tokens": 526583710.0, + "reward": 0.8401041746139526, + "reward_std": 0.07929302006959915, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8401041746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22597428858280183, + "sampling/importance_sampling_ratio/max": 1.9008899450302124, + "sampling/importance_sampling_ratio/mean": 0.9999897599220275, + "sampling/importance_sampling_ratio/min": 0.36771447360515597, + "sampling/sampling_logp_difference/max": 1.1116267204284669, + "sampling/sampling_logp_difference/mean": 0.013435482792556287, + "step": 3840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1499.2, + "completions/max_terminated_length": 1499.2, + "completions/mean_length": 1125.628125, + "completions/mean_terminated_length": 1125.628125, + "completions/min_length": 837.6, + "completions/min_terminated_length": 837.6, + "entropy": 0.24567932784557342, + "epoch": 4.518213866039953, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3865836262702942, + "learning_rate": 4.979403925369518e-08, + "loss": -0.0024, + "num_tokens": 527265495.0, + "reward": 0.9781250119209289, + "reward_std": 0.039643457531929015, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9781250119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.0770312175154686, + "sampling/importance_sampling_ratio/max": 1.9671642780303955, + "sampling/importance_sampling_ratio/mean": 0.9999995350837707, + "sampling/importance_sampling_ratio/min": 0.29105359613895415, + "sampling/sampling_logp_difference/max": 1.2808002710342408, + "sampling/sampling_logp_difference/mean": 0.01274335365742445, + "step": 3845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.4, + "completions/max_terminated_length": 1580.4, + "completions/mean_length": 1136.375, + "completions/mean_terminated_length": 1136.375, + "completions/min_length": 814.2, + "completions/min_terminated_length": 814.2, + "entropy": 0.2531812101602554, + "epoch": 4.524089306698002, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.0, + "learning_rate": 4.918827235279864e-08, + "loss": 0.0018, + "num_tokens": 527932655.0, + "reward": 0.8760416746139527, + "reward_std": 0.03966931700706482, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8760416746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2241850331425667, + "sampling/importance_sampling_ratio/max": 1.9524729251861572, + "sampling/importance_sampling_ratio/mean": 0.9998790860176087, + "sampling/importance_sampling_ratio/min": 0.35629400610923767, + "sampling/sampling_logp_difference/max": 1.0961596488952636, + "sampling/sampling_logp_difference/mean": 0.012896392308175565, + "step": 3850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1752.4, + "completions/max_terminated_length": 1752.4, + "completions/mean_length": 1215.825, + "completions/mean_terminated_length": 1215.825, + "completions/min_length": 906.6, + "completions/min_terminated_length": 906.6, + "entropy": 0.2722884476184845, + "epoch": 4.529964747356051, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7329661846160889, + "learning_rate": 4.8582505451902106e-08, + "loss": 0.0008, + "num_tokens": 528649975.0, + "reward": 0.9338541746139526, + "reward_std": 0.06870361119508743, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9338541746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.14670785665512084, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000779271125793, + "sampling/importance_sampling_ratio/min": 0.3969003438949585, + "sampling/sampling_logp_difference/max": 1.0312414169311523, + "sampling/sampling_logp_difference/mean": 0.013571283221244812, + "step": 3855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1433.0, + "completions/max_terminated_length": 1433.0, + "completions/mean_length": 1156.028125, + "completions/mean_terminated_length": 1156.028125, + "completions/min_length": 884.6, + "completions/min_terminated_length": 884.6, + "entropy": 0.25865795016288756, + "epoch": 4.535840188014101, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.42414742708206177, + "learning_rate": 4.797673855100557e-08, + "loss": -0.0011, + "num_tokens": 529328304.0, + "reward": 0.8830729484558105, + "reward_std": 0.0488592691719532, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8830729484558105, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19041550755500794, + "sampling/importance_sampling_ratio/max": 1.9730222463607787, + "sampling/importance_sampling_ratio/mean": 1.0000560283660889, + "sampling/importance_sampling_ratio/min": 0.45640029907226565, + "sampling/sampling_logp_difference/max": 0.8613581180572509, + "sampling/sampling_logp_difference/mean": 0.012920542247593403, + "step": 3860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1729.0, + "completions/max_terminated_length": 1729.0, + "completions/mean_length": 1203.79375, + "completions/mean_terminated_length": 1203.79375, + "completions/min_length": 920.8, + "completions/min_terminated_length": 920.8, + "entropy": 0.25510145127773287, + "epoch": 4.541715628672151, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.8556481003761292, + "learning_rate": 4.737097165010904e-08, + "loss": -0.0001, + "num_tokens": 530035038.0, + "reward": 0.8848958373069763, + "reward_std": 0.050856249034404756, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8848958373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17340871691703796, + "sampling/importance_sampling_ratio/max": 1.9588409900665282, + "sampling/importance_sampling_ratio/mean": 0.9999428868293763, + "sampling/importance_sampling_ratio/min": 0.3276062309741974, + "sampling/sampling_logp_difference/max": 1.2199375867843627, + "sampling/sampling_logp_difference/mean": 0.0131689066067338, + "step": 3865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1652.8, + "completions/max_terminated_length": 1652.8, + "completions/mean_length": 1153.7875, + "completions/mean_terminated_length": 1153.7875, + "completions/min_length": 906.8, + "completions/min_terminated_length": 906.8, + "entropy": 0.2554138332605362, + "epoch": 4.5475910693302, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6005891561508179, + "learning_rate": 4.67652047492125e-08, + "loss": 0.0004, + "num_tokens": 530738186.0, + "reward": 0.7763020992279053, + "reward_std": 0.05372370630502701, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7763020992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28807159811258315, + "sampling/importance_sampling_ratio/max": 1.9729528188705445, + "sampling/importance_sampling_ratio/mean": 1.0000159978866576, + "sampling/importance_sampling_ratio/min": 0.34084552526474, + "sampling/sampling_logp_difference/max": 1.1215097904205322, + "sampling/sampling_logp_difference/mean": 0.01334170252084732, + "step": 3870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1798.8, + "completions/max_terminated_length": 1686.6, + "completions/mean_length": 1217.609375, + "completions/mean_terminated_length": 1213.7258544921874, + "completions/min_length": 912.2, + "completions/min_terminated_length": 912.2, + "entropy": 0.2649496465921402, + "epoch": 4.553466509988249, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.11833023279905319, + "learning_rate": 4.615943784831597e-08, + "loss": -0.0126, + "num_tokens": 531457721.0, + "reward": 0.8276041865348815, + "reward_std": 0.09151726067066193, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8276041865348815, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24235431849956512, + "sampling/importance_sampling_ratio/max": 1.9542683362960815, + "sampling/importance_sampling_ratio/mean": 1.0000524044036865, + "sampling/importance_sampling_ratio/min": 0.42125728726387024, + "sampling/sampling_logp_difference/max": 0.8714985370635986, + "sampling/sampling_logp_difference/mean": 0.013429945148527623, + "step": 3875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1519.8, + "completions/max_terminated_length": 1519.8, + "completions/mean_length": 1170.1625, + "completions/mean_terminated_length": 1170.1625, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "entropy": 0.2552923381328583, + "epoch": 4.559341950646298, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5462660193443298, + "learning_rate": 4.555367094741943e-08, + "loss": 0.0003, + "num_tokens": 532149821.0, + "reward": 0.8942708373069763, + "reward_std": 0.05812600329518318, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8942708373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1705754280090332, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000134825706481, + "sampling/importance_sampling_ratio/min": 0.3502050653100014, + "sampling/sampling_logp_difference/max": 1.247010850906372, + "sampling/sampling_logp_difference/mean": 0.012968187779188156, + "step": 3880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1731.6, + "completions/max_terminated_length": 1731.6, + "completions/mean_length": 1178.2, + "completions/mean_terminated_length": 1178.2, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.2585215061903, + "epoch": 4.565217391304348, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5714375376701355, + "learning_rate": 4.4947904046522897e-08, + "loss": -0.0019, + "num_tokens": 532852845.0, + "reward": 0.8404687643051147, + "reward_std": 0.06901195198297501, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8404687643051147, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24585144221782684, + "sampling/importance_sampling_ratio/max": 1.974919819831848, + "sampling/importance_sampling_ratio/mean": 0.999848234653473, + "sampling/importance_sampling_ratio/min": 0.2790116786956787, + "sampling/sampling_logp_difference/max": 1.7504661321640014, + "sampling/sampling_logp_difference/mean": 0.013239697739481925, + "step": 3885 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1500.2, + "completions/max_terminated_length": 1500.2, + "completions/mean_length": 1135.875, + "completions/mean_terminated_length": 1135.875, + "completions/min_length": 773.6, + "completions/min_terminated_length": 773.6, + "entropy": 0.2818588376045227, + "epoch": 4.571092831962397, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.4970671236515045, + "learning_rate": 4.434213714562636e-08, + "loss": -0.0024, + "num_tokens": 533555349.0, + "reward": 0.8330729246139527, + "reward_std": 0.038709495961666104, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8330729246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.30427423417568206, + "sampling/importance_sampling_ratio/max": 1.9636075735092162, + "sampling/importance_sampling_ratio/mean": 1.0000600218772888, + "sampling/importance_sampling_ratio/min": 0.4317789852619171, + "sampling/sampling_logp_difference/max": 0.952790904045105, + "sampling/sampling_logp_difference/mean": 0.014102857001125813, + "step": 3890 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1470.2, + "completions/max_terminated_length": 1470.2, + "completions/mean_length": 1135.459375, + "completions/mean_terminated_length": 1135.459375, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 0.2717915177345276, + "epoch": 4.576968272620446, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.37997737526893616, + "learning_rate": 4.373637024472983e-08, + "loss": -0.0001, + "num_tokens": 534205528.0, + "reward": 0.9541666746139527, + "reward_std": 0.03333333283662796, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9541666746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.09064806997776031, + "sampling/importance_sampling_ratio/max": 1.943871831893921, + "sampling/importance_sampling_ratio/mean": 0.9999991655349731, + "sampling/importance_sampling_ratio/min": 0.33060679733753207, + "sampling/sampling_logp_difference/max": 1.4704230308532715, + "sampling/sampling_logp_difference/mean": 0.01357163693755865, + "step": 3895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1652.8, + "completions/max_terminated_length": 1652.8, + "completions/mean_length": 1220.6125, + "completions/mean_terminated_length": 1220.6125, + "completions/min_length": 940.2, + "completions/min_terminated_length": 940.2, + "entropy": 0.26435089111328125, + "epoch": 4.582843713278496, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.9621942639350891, + "learning_rate": 4.313060334383329e-08, + "loss": 0.0041, + "num_tokens": 534929196.0, + "reward": 0.9328125238418579, + "reward_std": 0.07412366569042206, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9328125238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15820899307727815, + "sampling/importance_sampling_ratio/max": 1.9864572286605835, + "sampling/importance_sampling_ratio/mean": 0.9999479293823242, + "sampling/importance_sampling_ratio/min": 0.278659051656725, + "sampling/sampling_logp_difference/max": 7.299841666221619, + "sampling/sampling_logp_difference/mean": 0.013311752490699292, + "step": 3900 + }, + { + "epoch": 4.582843713278496, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1600.16, + "eval_completions/max_terminated_length": 1600.16, + "eval_completions/mean_length": 1158.768125, + "eval_completions/mean_terminated_length": 1158.768125, + "eval_completions/min_length": 863.52, + "eval_completions/min_terminated_length": 863.52, + "eval_entropy": 0.2665262085199356, + "eval_frac_reward_zero_std": 0.6, + "eval_loss": 0.0024516424164175987, + "eval_num_tokens": 534929196.0, + "eval_reward": 0.7693125116825104, + "eval_reward_std": 0.07587137021124363, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7693125116825104, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29620310723781584, + "eval_runtime": 435.3245, + "eval_samples_per_second": 0.23, + "eval_sampling/importance_sampling_ratio/max": 1.935142192840576, + "eval_sampling/importance_sampling_ratio/mean": 0.9999475979804993, + "eval_sampling/importance_sampling_ratio/min": 0.30313371320943816, + "eval_sampling/sampling_logp_difference/max": 2.0260102558135986, + "eval_sampling/sampling_logp_difference/mean": 0.013469803594052792, + "eval_steps_per_second": 0.005, + "step": 3900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.6, + "completions/max_terminated_length": 1640.6, + "completions/mean_length": 1170.825, + "completions/mean_terminated_length": 1170.825, + "completions/min_length": 838.4, + "completions/min_terminated_length": 838.4, + "entropy": 0.24506229162216187, + "epoch": 4.5887191539365455, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7513599991798401, + "learning_rate": 4.252483644293676e-08, + "loss": -0.0017, + "num_tokens": 535622804.0, + "reward": 0.9093750238418579, + "reward_std": 0.07427767887711526, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9093750238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15920108705759048, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0001034498214723, + "sampling/importance_sampling_ratio/min": 0.3779327243566513, + "sampling/sampling_logp_difference/max": 1.4409908533096314, + "sampling/sampling_logp_difference/mean": 0.012623549997806549, + "step": 3905 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1821.8, + "completions/max_terminated_length": 1821.8, + "completions/mean_length": 1271.396875, + "completions/mean_terminated_length": 1271.396875, + "completions/min_length": 911.2, + "completions/min_terminated_length": 911.2, + "entropy": 0.2800646245479584, + "epoch": 4.594594594594595, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6105312705039978, + "learning_rate": 4.191906954204022e-08, + "loss": 0.0054, + "num_tokens": 536353683.0, + "reward": 0.8318229198455811, + "reward_std": 0.08738780058920384, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8318229198455811, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18347244337201118, + "sampling/importance_sampling_ratio/max": 1.9310923099517823, + "sampling/importance_sampling_ratio/mean": 1.0000660181045533, + "sampling/importance_sampling_ratio/min": 0.3126604288816452, + "sampling/sampling_logp_difference/max": 1.3532617330551147, + "sampling/sampling_logp_difference/mean": 0.013997785560786725, + "step": 3910 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1372.8, + "completions/max_terminated_length": 1372.8, + "completions/mean_length": 1049.075, + "completions/mean_terminated_length": 1049.075, + "completions/min_length": 797.6, + "completions/min_terminated_length": 797.6, + "entropy": 0.24361443221569062, + "epoch": 4.600470035252644, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 4.131330264114369e-08, + "loss": -0.0028, + "num_tokens": 536989403.0, + "reward": 0.8885416746139526, + "reward_std": 0.04319274723529816, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8885416746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.11913625001907349, + "sampling/importance_sampling_ratio/max": 1.9888309478759765, + "sampling/importance_sampling_ratio/mean": 1.0000429511070252, + "sampling/importance_sampling_ratio/min": 0.3370845317840576, + "sampling/sampling_logp_difference/max": 1.1793973922729493, + "sampling/sampling_logp_difference/mean": 0.01257583238184452, + "step": 3915 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.2, + "completions/max_terminated_length": 1515.2, + "completions/mean_length": 1166.315625, + "completions/mean_terminated_length": 1166.315625, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "entropy": 0.2607916831970215, + "epoch": 4.606345475910693, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.4696587920188904, + "learning_rate": 4.070753574024715e-08, + "loss": -0.0037, + "num_tokens": 537690480.0, + "reward": 0.7993229150772094, + "reward_std": 0.030344261974096298, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7993229269981384, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23416321575641633, + "sampling/importance_sampling_ratio/max": 1.9028977155685425, + "sampling/importance_sampling_ratio/mean": 1.0000278234481812, + "sampling/importance_sampling_ratio/min": 0.294919428229332, + "sampling/sampling_logp_difference/max": 1.323739767074585, + "sampling/sampling_logp_difference/mean": 0.013191297091543675, + "step": 3920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1660.6, + "completions/max_terminated_length": 1660.6, + "completions/mean_length": 1238.453125, + "completions/mean_terminated_length": 1238.453125, + "completions/min_length": 955.6, + "completions/min_terminated_length": 955.6, + "entropy": 0.27204486131668093, + "epoch": 4.6122209165687424, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 4.010176883935062e-08, + "loss": 0.0012, + "num_tokens": 538432081.0, + "reward": 0.8140625119209289, + "reward_std": 0.058893933147192004, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8140625119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27546189576387403, + "sampling/importance_sampling_ratio/max": 1.9770929336547851, + "sampling/importance_sampling_ratio/mean": 1.0000423312187194, + "sampling/importance_sampling_ratio/min": 0.27728479243814946, + "sampling/sampling_logp_difference/max": 1.5920629739761352, + "sampling/sampling_logp_difference/mean": 0.013747538067400455, + "step": 3925 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1446.6, + "completions/max_terminated_length": 1446.6, + "completions/mean_length": 1153.44375, + "completions/mean_terminated_length": 1153.44375, + "completions/min_length": 893.2, + "completions/min_terminated_length": 893.2, + "entropy": 0.25421291291713716, + "epoch": 4.618096357226792, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 3.949600193845408e-08, + "loss": -0.0001, + "num_tokens": 539115407.0, + "reward": 0.9856771111488343, + "reward_std": 0.037527060508728026, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9856771111488343, + "rewards/e2e_recall_precision_mixed_reward/std": 0.06466835737228394, + "sampling/importance_sampling_ratio/max": 1.9693354606628417, + "sampling/importance_sampling_ratio/mean": 0.999970543384552, + "sampling/importance_sampling_ratio/min": 0.3985715299844742, + "sampling/sampling_logp_difference/max": 1.1402654647827148, + "sampling/sampling_logp_difference/mean": 0.013061221688985825, + "step": 3930 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1721.0, + "completions/max_terminated_length": 1721.0, + "completions/mean_length": 1277.778125, + "completions/mean_terminated_length": 1277.778125, + "completions/min_length": 917.0, + "completions/min_terminated_length": 917.0, + "entropy": 0.2775550276041031, + "epoch": 4.623971797884842, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.3847541809082031, + "learning_rate": 3.8890235037557545e-08, + "loss": 0.0022, + "num_tokens": 539880904.0, + "reward": 0.8575520992279053, + "reward_std": 0.05469306409358978, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8575520992279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1839374229311943, + "sampling/importance_sampling_ratio/max": 1.9318484306335448, + "sampling/importance_sampling_ratio/mean": 0.9999679446220398, + "sampling/importance_sampling_ratio/min": 0.2941402941942215, + "sampling/sampling_logp_difference/max": 1.2686848640441895, + "sampling/sampling_logp_difference/mean": 0.013829667121171951, + "step": 3935 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 1206.425, + "completions/mean_terminated_length": 1206.425, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "entropy": 0.27049291133880615, + "epoch": 4.629847238542891, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6516361832618713, + "learning_rate": 3.828446813666101e-08, + "loss": -0.0076, + "num_tokens": 540607264.0, + "reward": 0.7458333432674408, + "reward_std": 0.07592152431607246, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7458333432674408, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2239207148551941, + "sampling/importance_sampling_ratio/max": 1.9414830446243285, + "sampling/importance_sampling_ratio/mean": 0.9999024152755738, + "sampling/importance_sampling_ratio/min": 0.4399965822696686, + "sampling/sampling_logp_difference/max": 1.0633353471755982, + "sampling/sampling_logp_difference/mean": 0.013744635693728923, + "step": 3940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025, + "completions/max_length": 1705.6, + "completions/max_terminated_length": 1683.6, + "completions/mean_length": 1243.896875, + "completions/mean_terminated_length": 1217.3513427734374, + "completions/min_length": 937.6, + "completions/min_terminated_length": 937.6, + "entropy": 0.27136591672897337, + "epoch": 4.63572267920094, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.45293742418289185, + "learning_rate": 3.767870123576448e-08, + "loss": -0.0076, + "num_tokens": 541313663.0, + "reward": 0.737500011920929, + "reward_std": 0.07753491625189782, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.737500011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.32985628843307496, + "sampling/importance_sampling_ratio/max": 1.9891737461090089, + "sampling/importance_sampling_ratio/mean": 1.000024402141571, + "sampling/importance_sampling_ratio/min": 0.3064669918268919, + "sampling/sampling_logp_difference/max": 1.5447747945785522, + "sampling/sampling_logp_difference/mean": 0.013566815294325352, + "step": 3945 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1630.4, + "completions/max_terminated_length": 1630.4, + "completions/mean_length": 1191.609375, + "completions/mean_terminated_length": 1191.609375, + "completions/min_length": 866.6, + "completions/min_terminated_length": 866.6, + "entropy": 0.24883055090904235, + "epoch": 4.6415981198589895, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6785134077072144, + "learning_rate": 3.707293433486794e-08, + "loss": 0.0025, + "num_tokens": 542031426.0, + "reward": 0.8836979389190673, + "reward_std": 0.07515020072460174, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8836979389190673, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21068175360560418, + "sampling/importance_sampling_ratio/max": 1.997580623626709, + "sampling/importance_sampling_ratio/mean": 1.00013267993927, + "sampling/importance_sampling_ratio/min": 0.40443000197410583, + "sampling/sampling_logp_difference/max": 0.9994537353515625, + "sampling/sampling_logp_difference/mean": 0.012894240953028203, + "step": 3950 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1558.4, + "completions/max_terminated_length": 1558.4, + "completions/mean_length": 1170.86875, + "completions/mean_terminated_length": 1170.86875, + "completions/min_length": 851.2, + "completions/min_terminated_length": 851.2, + "entropy": 0.2621671468019485, + "epoch": 4.647473560517039, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6207395195960999, + "learning_rate": 3.646716743397141e-08, + "loss": 0.0014, + "num_tokens": 542759560.0, + "reward": 0.8729166865348816, + "reward_std": 0.059103918820619585, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8729166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.19129492193460465, + "sampling/importance_sampling_ratio/max": 1.8469502925872803, + "sampling/importance_sampling_ratio/mean": 1.000000774860382, + "sampling/importance_sampling_ratio/min": 0.3592810183763504, + "sampling/sampling_logp_difference/max": 1.0624561548233031, + "sampling/sampling_logp_difference/mean": 0.01351084392517805, + "step": 3955 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1508.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 1172.725, + "completions/mean_terminated_length": 1172.725, + "completions/min_length": 919.4, + "completions/min_terminated_length": 919.4, + "entropy": 0.2633721888065338, + "epoch": 4.653349001175088, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.7144450545310974, + "learning_rate": 3.5861400533074866e-08, + "loss": -0.0002, + "num_tokens": 543453360.0, + "reward": 0.8821875095367432, + "reward_std": 0.06506949663162231, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8821875095367432, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18558357805013656, + "sampling/importance_sampling_ratio/max": 1.9161983489990235, + "sampling/importance_sampling_ratio/mean": 0.999916136264801, + "sampling/importance_sampling_ratio/min": 0.28187123835086825, + "sampling/sampling_logp_difference/max": 1.3893019914627076, + "sampling/sampling_logp_difference/mean": 0.013280938379466534, + "step": 3960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1580.8, + "completions/max_terminated_length": 1580.8, + "completions/mean_length": 1179.315625, + "completions/mean_terminated_length": 1179.315625, + "completions/min_length": 882.6, + "completions/min_terminated_length": 882.6, + "entropy": 0.2579510986804962, + "epoch": 4.659224441833137, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6650709509849548, + "learning_rate": 3.5255633632178335e-08, + "loss": 0.0012, + "num_tokens": 544191765.0, + "reward": 0.848437511920929, + "reward_std": 0.05586938187479973, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.848437511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21678120493888856, + "sampling/importance_sampling_ratio/max": 1.9823765516281129, + "sampling/importance_sampling_ratio/mean": 0.9999704122543335, + "sampling/importance_sampling_ratio/min": 0.3650485098361969, + "sampling/sampling_logp_difference/max": 1.1645691871643067, + "sampling/sampling_logp_difference/mean": 0.013468297570943833, + "step": 3965 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1900.4, + "completions/max_terminated_length": 1900.4, + "completions/mean_length": 1203.55, + "completions/mean_terminated_length": 1203.55, + "completions/min_length": 880.4, + "completions/min_terminated_length": 880.4, + "entropy": 0.24793876111507415, + "epoch": 4.6650998824911865, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5894632339477539, + "learning_rate": 3.46498667312818e-08, + "loss": 0.0004, + "num_tokens": 544872565.0, + "reward": 0.7872395992279053, + "reward_std": 0.09235174730420112, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7872396111488342, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3238639831542969, + "sampling/importance_sampling_ratio/max": 1.927569079399109, + "sampling/importance_sampling_ratio/mean": 0.9999173641204834, + "sampling/importance_sampling_ratio/min": 0.3570482492446899, + "sampling/sampling_logp_difference/max": 1.0754551410675048, + "sampling/sampling_logp_difference/mean": 0.012769071571528911, + "step": 3970 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1466.2, + "completions/max_terminated_length": 1466.2, + "completions/mean_length": 1168.040625, + "completions/mean_terminated_length": 1168.040625, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "entropy": 0.2611490249633789, + "epoch": 4.670975323149237, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 3.404409983038527e-08, + "loss": 0.0065, + "num_tokens": 545571234.0, + "reward": 0.8796875238418579, + "reward_std": 0.06576394066214561, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8796875238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2026699587702751, + "sampling/importance_sampling_ratio/max": 1.9442184925079347, + "sampling/importance_sampling_ratio/mean": 0.9999972224235535, + "sampling/importance_sampling_ratio/min": 0.37336876094341276, + "sampling/sampling_logp_difference/max": 1.0985134363174438, + "sampling/sampling_logp_difference/mean": 0.013413655199110508, + "step": 3975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1692.6, + "completions/max_terminated_length": 1692.6, + "completions/mean_length": 1231.178125, + "completions/mean_terminated_length": 1231.178125, + "completions/min_length": 876.6, + "completions/min_terminated_length": 876.6, + "entropy": 0.2676435440778732, + "epoch": 4.676850763807286, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.8372879028320312, + "learning_rate": 3.343833292948873e-08, + "loss": 0.0038, + "num_tokens": 546334107.0, + "reward": 0.8958333492279053, + "reward_std": 0.06377268582582474, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8958333492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21343012303113937, + "sampling/importance_sampling_ratio/max": 1.99753577709198, + "sampling/importance_sampling_ratio/mean": 0.999961519241333, + "sampling/importance_sampling_ratio/min": 0.31758705228567125, + "sampling/sampling_logp_difference/max": 1.2507687091827393, + "sampling/sampling_logp_difference/mean": 0.013471122644841672, + "step": 3980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1752.2, + "completions/max_terminated_length": 1752.2, + "completions/mean_length": 1251.534375, + "completions/mean_terminated_length": 1251.534375, + "completions/min_length": 935.4, + "completions/min_terminated_length": 935.4, + "entropy": 0.2761573553085327, + "epoch": 4.682726204465335, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.5521088242530823, + "learning_rate": 3.28325660285922e-08, + "loss": 0.0005, + "num_tokens": 547056854.0, + "reward": 0.9140625, + "reward_std": 0.042132827639579776, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9140625, + "rewards/e2e_recall_precision_mixed_reward/std": 0.10463754087686539, + "sampling/importance_sampling_ratio/max": 1.9920671224594115, + "sampling/importance_sampling_ratio/mean": 0.999975323677063, + "sampling/importance_sampling_ratio/min": 0.3770316272974014, + "sampling/sampling_logp_difference/max": 1.247798991203308, + "sampling/sampling_logp_difference/mean": 0.013781622983515263, + "step": 3985 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1766.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 1225.1625, + "completions/mean_terminated_length": 1225.1625, + "completions/min_length": 924.2, + "completions/min_terminated_length": 924.2, + "entropy": 0.27316820025444033, + "epoch": 4.688601645123384, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5807452201843262, + "learning_rate": 3.2226799127695656e-08, + "loss": 0.0042, + "num_tokens": 547772058.0, + "reward": 0.7897916674613953, + "reward_std": 0.05927639305591583, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7897916674613953, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26461312770843504, + "sampling/importance_sampling_ratio/max": 1.8936434745788575, + "sampling/importance_sampling_ratio/mean": 0.9999576568603515, + "sampling/importance_sampling_ratio/min": 0.3703816294670105, + "sampling/sampling_logp_difference/max": 1.2750481605529784, + "sampling/sampling_logp_difference/mean": 0.013870716467499733, + "step": 3990 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1602.6, + "completions/max_terminated_length": 1602.6, + "completions/mean_length": 1158.24375, + "completions/mean_terminated_length": 1158.24375, + "completions/min_length": 877.2, + "completions/min_terminated_length": 877.2, + "entropy": 0.255287766456604, + "epoch": 4.6944770857814335, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9467837810516357, + "learning_rate": 3.1621032226799126e-08, + "loss": 0.0016, + "num_tokens": 548446504.0, + "reward": 0.909375011920929, + "reward_std": 0.0933942548930645, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.909375011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1927630953490734, + "sampling/importance_sampling_ratio/max": 1.9999804496765137, + "sampling/importance_sampling_ratio/mean": 1.0001122236251831, + "sampling/importance_sampling_ratio/min": 0.35746287405490873, + "sampling/sampling_logp_difference/max": 1.3460246801376343, + "sampling/sampling_logp_difference/mean": 0.013273773156106472, + "step": 3995 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1541.6, + "completions/max_terminated_length": 1541.6, + "completions/mean_length": 1136.003125, + "completions/mean_terminated_length": 1136.003125, + "completions/min_length": 832.8, + "completions/min_terminated_length": 832.8, + "entropy": 0.25159532129764556, + "epoch": 4.700352526439483, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.8297567367553711, + "learning_rate": 3.101526532590259e-08, + "loss": 0.0062, + "num_tokens": 549096761.0, + "reward": 0.8660937666893005, + "reward_std": 0.10318772792816162, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8660937666893005, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22221954464912413, + "sampling/importance_sampling_ratio/max": 1.9245586156845094, + "sampling/importance_sampling_ratio/mean": 1.0000725388526917, + "sampling/importance_sampling_ratio/min": 0.32359273731708527, + "sampling/sampling_logp_difference/max": 1.1840140581130982, + "sampling/sampling_logp_difference/mean": 0.012639879994094372, + "step": 4000 + }, + { + "epoch": 4.700352526439483, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1620.08, + "eval_completions/max_terminated_length": 1620.08, + "eval_completions/mean_length": 1171.0, + "eval_completions/mean_terminated_length": 1171.0, + "eval_completions/min_length": 876.84, + "eval_completions/min_terminated_length": 876.84, + "eval_entropy": 0.2678144866228104, + "eval_frac_reward_zero_std": 0.61, + "eval_loss": 0.002559106098487973, + "eval_num_tokens": 549096761.0, + "eval_reward": 0.7812708473205566, + "eval_reward_std": 0.07859123006463051, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7812708473205566, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.2977286207675934, + "eval_runtime": 445.3753, + "eval_samples_per_second": 0.225, + "eval_sampling/importance_sampling_ratio/max": 1.9588973712921143, + "eval_sampling/importance_sampling_ratio/mean": 1.000038492679596, + "eval_sampling/importance_sampling_ratio/min": 0.37385270178318025, + "eval_sampling/sampling_logp_difference/max": 1.1236275243759155, + "eval_sampling/sampling_logp_difference/mean": 0.013486710004508495, + "eval_steps_per_second": 0.004, + "step": 4000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1669.4, + "completions/max_terminated_length": 1669.4, + "completions/mean_length": 1135.534375, + "completions/mean_terminated_length": 1135.534375, + "completions/min_length": 805.4, + "completions/min_terminated_length": 805.4, + "entropy": 0.2553311824798584, + "epoch": 4.706227967097532, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.40861353278160095, + "learning_rate": 3.040949842500606e-08, + "loss": 0.0015, + "num_tokens": 549800276.0, + "reward": 0.806250023841858, + "reward_std": 0.07501365020871162, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.806250023841858, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24988763481378556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000290989875793, + "sampling/importance_sampling_ratio/min": 0.300783509016037, + "sampling/sampling_logp_difference/max": 1.3301929354667663, + "sampling/sampling_logp_difference/mean": 0.013078136369585991, + "step": 4005 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1840.6, + "completions/max_terminated_length": 1725.8, + "completions/mean_length": 1265.096875, + "completions/mean_terminated_length": 1261.593212890625, + "completions/min_length": 888.2, + "completions/min_terminated_length": 888.2, + "entropy": 0.2794096082448959, + "epoch": 4.712103407755581, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.541865885257721, + "learning_rate": 2.980373152410952e-08, + "loss": -0.0058, + "num_tokens": 550526655.0, + "reward": 0.9447916746139526, + "reward_std": 0.07777333706617355, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9447916746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.13151190429925919, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000796794891358, + "sampling/importance_sampling_ratio/min": 0.3717313528060913, + "sampling/sampling_logp_difference/max": 1.1467663288116454, + "sampling/sampling_logp_difference/mean": 0.013969559781253338, + "step": 4010 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1709.2, + "completions/max_terminated_length": 1709.2, + "completions/mean_length": 1249.7875, + "completions/mean_terminated_length": 1249.7875, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.28808929324150084, + "epoch": 4.717978848413631, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.47011598944664, + "learning_rate": 2.9197964623212987e-08, + "loss": -0.0053, + "num_tokens": 551228443.0, + "reward": 0.8390625, + "reward_std": 0.05702723562717438, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8390625, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24377740174531937, + "sampling/importance_sampling_ratio/max": 1.9100152969360351, + "sampling/importance_sampling_ratio/mean": 0.9998533844947814, + "sampling/importance_sampling_ratio/min": 0.31113848388195037, + "sampling/sampling_logp_difference/max": 1.2729925632476806, + "sampling/sampling_logp_difference/mean": 0.014077169820666313, + "step": 4015 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1533.0, + "completions/max_terminated_length": 1533.0, + "completions/mean_length": 1139.98125, + "completions/mean_terminated_length": 1139.98125, + "completions/min_length": 857.2, + "completions/min_terminated_length": 857.2, + "entropy": 0.2606339991092682, + "epoch": 4.723854289071681, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6212969422340393, + "learning_rate": 2.859219772231645e-08, + "loss": 0.003, + "num_tokens": 551908709.0, + "reward": 0.86171875, + "reward_std": 0.05891707688570023, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.86171875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25346395522356036, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000167965888977, + "sampling/importance_sampling_ratio/min": 0.2859252363443375, + "sampling/sampling_logp_difference/max": 1.3241569757461549, + "sampling/sampling_logp_difference/mean": 0.013379177823662759, + "step": 4020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1564.2, + "completions/max_terminated_length": 1564.2, + "completions/mean_length": 1149.69375, + "completions/mean_terminated_length": 1149.69375, + "completions/min_length": 864.8, + "completions/min_terminated_length": 864.8, + "entropy": 0.2674075663089752, + "epoch": 4.72972972972973, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.6235418915748596, + "learning_rate": 2.7986430821419916e-08, + "loss": -0.0026, + "num_tokens": 552624643.0, + "reward": 0.7882812559604645, + "reward_std": 0.05237821713089943, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7882812559604645, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2877985715866089, + "sampling/importance_sampling_ratio/max": 1.906586217880249, + "sampling/importance_sampling_ratio/mean": 0.9999934434890747, + "sampling/importance_sampling_ratio/min": 0.45421077609062194, + "sampling/sampling_logp_difference/max": 0.8108912110328674, + "sampling/sampling_logp_difference/mean": 0.013526509329676629, + "step": 4025 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.6, + "completions/max_terminated_length": 1647.6, + "completions/mean_length": 1204.103125, + "completions/mean_terminated_length": 1204.103125, + "completions/min_length": 949.2, + "completions/min_terminated_length": 949.2, + "entropy": 0.27815998792648317, + "epoch": 4.735605170387779, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.7050502896308899, + "learning_rate": 2.7380663920523383e-08, + "loss": 0.0019, + "num_tokens": 553323572.0, + "reward": 0.8698958396911621, + "reward_std": 0.03121014088392258, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8698958396911621, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16822029352188111, + "sampling/importance_sampling_ratio/max": 1.90050311088562, + "sampling/importance_sampling_ratio/mean": 1.000030755996704, + "sampling/importance_sampling_ratio/min": 0.3887382984161377, + "sampling/sampling_logp_difference/max": 0.9473320245742798, + "sampling/sampling_logp_difference/mean": 0.013588633574545383, + "step": 4030 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1697.4, + "completions/max_terminated_length": 1697.4, + "completions/mean_length": 1214.74375, + "completions/mean_terminated_length": 1214.74375, + "completions/min_length": 905.8, + "completions/min_terminated_length": 905.8, + "entropy": 0.2639480948448181, + "epoch": 4.741480611045828, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.4301367998123169, + "learning_rate": 2.6774897019626845e-08, + "loss": -0.0019, + "num_tokens": 554043042.0, + "reward": 0.8942708492279052, + "reward_std": 0.08395053669810296, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8942708492279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21787521839141846, + "sampling/importance_sampling_ratio/max": 1.9720707178115844, + "sampling/importance_sampling_ratio/mean": 1.0000418543815612, + "sampling/importance_sampling_ratio/min": 0.3419287145137787, + "sampling/sampling_logp_difference/max": 1.1653326988220214, + "sampling/sampling_logp_difference/mean": 0.013334690406918526, + "step": 4035 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1683.0, + "completions/max_terminated_length": 1683.0, + "completions/mean_length": 1188.94375, + "completions/mean_terminated_length": 1188.94375, + "completions/min_length": 851.4, + "completions/min_terminated_length": 851.4, + "entropy": 0.26343382298946383, + "epoch": 4.7473560517038775, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.6100848317146301, + "learning_rate": 2.616913011873031e-08, + "loss": 0.0105, + "num_tokens": 554709632.0, + "reward": 0.8104166865348816, + "reward_std": 0.09645045325160026, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8104166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24818702638149262, + "sampling/importance_sampling_ratio/max": 1.929994034767151, + "sampling/importance_sampling_ratio/mean": 0.9999106764793396, + "sampling/importance_sampling_ratio/min": 0.4044054388999939, + "sampling/sampling_logp_difference/max": 0.9244142293930053, + "sampling/sampling_logp_difference/mean": 0.013344330713152885, + "step": 4040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1816.0, + "completions/max_terminated_length": 1816.0, + "completions/mean_length": 1253.66875, + "completions/mean_terminated_length": 1253.66875, + "completions/min_length": 878.6, + "completions/min_terminated_length": 878.6, + "entropy": 0.2727950155735016, + "epoch": 4.753231492361927, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5697612762451172, + "learning_rate": 2.5563363217833778e-08, + "loss": -0.0015, + "num_tokens": 555431734.0, + "reward": 0.7989583492279053, + "reward_std": 0.08472683280706406, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7989583492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3380668729543686, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000714182853698, + "sampling/importance_sampling_ratio/min": 0.2469423845410347, + "sampling/sampling_logp_difference/max": 1.522918152809143, + "sampling/sampling_logp_difference/mean": 0.013825003430247307, + "step": 4045 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 1177.33125, + "completions/mean_terminated_length": 1177.33125, + "completions/min_length": 785.8, + "completions/min_terminated_length": 785.8, + "entropy": 0.27231194376945494, + "epoch": 4.759106933019977, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.38864025473594666, + "learning_rate": 2.495759631693724e-08, + "loss": 0.0024, + "num_tokens": 556151040.0, + "reward": 0.8744791984558106, + "reward_std": 0.0681779682636261, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8744791984558106, + "rewards/e2e_recall_precision_mixed_reward/std": 0.20191093385219575, + "sampling/importance_sampling_ratio/max": 1.9608511447906494, + "sampling/importance_sampling_ratio/mean": 1.0000580906867982, + "sampling/importance_sampling_ratio/min": 0.33743279576301577, + "sampling/sampling_logp_difference/max": 1.128471803665161, + "sampling/sampling_logp_difference/mean": 0.013923012092709542, + "step": 4050 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1602.0, + "completions/max_terminated_length": 1602.0, + "completions/mean_length": 1198.00625, + "completions/mean_terminated_length": 1198.00625, + "completions/min_length": 891.2, + "completions/min_terminated_length": 891.2, + "entropy": 0.27365310192108155, + "epoch": 4.764982373678026, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.35927513241767883, + "learning_rate": 2.4351829416040707e-08, + "loss": 0.0013, + "num_tokens": 556885026.0, + "reward": 0.8122395992279052, + "reward_std": 0.10333790630102158, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8122395992279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2743197739124298, + "sampling/importance_sampling_ratio/max": 1.9623225688934327, + "sampling/importance_sampling_ratio/mean": 1.000006079673767, + "sampling/importance_sampling_ratio/min": 0.36535613536834716, + "sampling/sampling_logp_difference/max": 1.1020445585250855, + "sampling/sampling_logp_difference/mean": 0.013802669942378998, + "step": 4055 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1808.0, + "completions/max_terminated_length": 1699.6, + "completions/mean_length": 1243.16875, + "completions/mean_terminated_length": 1239.767333984375, + "completions/min_length": 923.4, + "completions/min_terminated_length": 923.4, + "entropy": 0.2812578797340393, + "epoch": 4.770857814336075, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6282638311386108, + "learning_rate": 2.374606251514417e-08, + "loss": -0.0145, + "num_tokens": 557614548.0, + "reward": 0.8213541626930236, + "reward_std": 0.06305044703185558, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8213541746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2784075617790222, + "sampling/importance_sampling_ratio/max": 1.9655727863311767, + "sampling/importance_sampling_ratio/mean": 1.0000412583351135, + "sampling/importance_sampling_ratio/min": 0.3901285082101822, + "sampling/sampling_logp_difference/max": 1.2020664453506469, + "sampling/sampling_logp_difference/mean": 0.01414027102291584, + "step": 4060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1658.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 1201.15, + "completions/mean_terminated_length": 1201.15, + "completions/min_length": 886.6, + "completions/min_terminated_length": 886.6, + "entropy": 0.25774570405483244, + "epoch": 4.776733254994125, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6430007815361023, + "learning_rate": 2.3140295614247636e-08, + "loss": 0.0008, + "num_tokens": 558341524.0, + "reward": 0.979687511920929, + "reward_std": 0.04776434972882271, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.979687511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.07553751319646836, + "sampling/importance_sampling_ratio/max": 1.9550664186477662, + "sampling/importance_sampling_ratio/mean": 1.0000919699668884, + "sampling/importance_sampling_ratio/min": 0.3034312278032303, + "sampling/sampling_logp_difference/max": 1.3194203138351441, + "sampling/sampling_logp_difference/mean": 0.013247447088360786, + "step": 4065 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1630.2, + "completions/max_terminated_length": 1630.2, + "completions/mean_length": 1149.659375, + "completions/mean_terminated_length": 1149.659375, + "completions/min_length": 857.8, + "completions/min_terminated_length": 857.8, + "entropy": 0.26009349822998046, + "epoch": 4.782608695652174, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5895561575889587, + "learning_rate": 2.2534528713351102e-08, + "loss": 0.0013, + "num_tokens": 559041063.0, + "reward": 0.8940104246139526, + "reward_std": 0.08193137794733048, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8940104246139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.18850080221891402, + "sampling/importance_sampling_ratio/max": 1.9591300249099732, + "sampling/importance_sampling_ratio/mean": 0.9999151587486267, + "sampling/importance_sampling_ratio/min": 0.2794345647096634, + "sampling/sampling_logp_difference/max": 1.3429698705673219, + "sampling/sampling_logp_difference/mean": 0.013466096855700016, + "step": 4070 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1634.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 1177.80625, + "completions/mean_terminated_length": 1177.80625, + "completions/min_length": 950.8, + "completions/min_terminated_length": 950.8, + "entropy": 0.2671573221683502, + "epoch": 4.788484136310223, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.43248382210731506, + "learning_rate": 2.1928761812454565e-08, + "loss": -0.0063, + "num_tokens": 559741417.0, + "reward": 0.8859375238418579, + "reward_std": 0.05752565562725067, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8859375238418579, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2233549490571022, + "sampling/importance_sampling_ratio/max": 1.9804691076278687, + "sampling/importance_sampling_ratio/mean": 1.0001835584640504, + "sampling/importance_sampling_ratio/min": 0.43283228278160096, + "sampling/sampling_logp_difference/max": 0.8982840418815613, + "sampling/sampling_logp_difference/mean": 0.013595516420900821, + "step": 4075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1527.2, + "completions/max_terminated_length": 1527.2, + "completions/mean_length": 1168.15, + "completions/mean_terminated_length": 1168.15, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "entropy": 0.2654030591249466, + "epoch": 4.794359576968272, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6133489012718201, + "learning_rate": 2.132299491155803e-08, + "loss": 0.0051, + "num_tokens": 560460969.0, + "reward": 0.8815104365348816, + "reward_std": 0.08448518216609954, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8815104365348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16949040293693543, + "sampling/importance_sampling_ratio/max": 1.931166696548462, + "sampling/importance_sampling_ratio/mean": 1.0000255107879639, + "sampling/importance_sampling_ratio/min": 0.3363965079188347, + "sampling/sampling_logp_difference/max": 1.3793178796768188, + "sampling/sampling_logp_difference/mean": 0.013709034956991672, + "step": 4080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1654.8, + "completions/max_terminated_length": 1654.8, + "completions/mean_length": 1204.725, + "completions/mean_terminated_length": 1204.725, + "completions/min_length": 909.8, + "completions/min_terminated_length": 909.8, + "entropy": 0.2663749247789383, + "epoch": 4.800235017626322, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.6036227941513062, + "learning_rate": 2.0717228010661497e-08, + "loss": 0.004, + "num_tokens": 561160193.0, + "reward": 0.9296875, + "reward_std": 0.06299812644720078, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9296875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.11461469382047654, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000427842140198, + "sampling/importance_sampling_ratio/min": 0.29730011597275735, + "sampling/sampling_logp_difference/max": 1.5401460886001588, + "sampling/sampling_logp_difference/mean": 0.01355595700442791, + "step": 4085 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1699.2, + "completions/max_terminated_length": 1699.2, + "completions/mean_length": 1228.4375, + "completions/mean_terminated_length": 1228.4375, + "completions/min_length": 936.4, + "completions/min_terminated_length": 936.4, + "entropy": 0.26536994278430937, + "epoch": 4.806110458284372, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4142984449863434, + "learning_rate": 2.011146110976496e-08, + "loss": 0.0049, + "num_tokens": 561858797.0, + "reward": 0.8854166865348816, + "reward_std": 0.07202789336442947, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8854166865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.184259794652462, + "sampling/importance_sampling_ratio/max": 1.9609678506851196, + "sampling/importance_sampling_ratio/mean": 1.0000285267829896, + "sampling/importance_sampling_ratio/min": 0.2662481516599655, + "sampling/sampling_logp_difference/max": 1.6673511743545533, + "sampling/sampling_logp_difference/mean": 0.01343822181224823, + "step": 4090 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1498.8, + "completions/max_terminated_length": 1498.8, + "completions/mean_length": 1137.05625, + "completions/mean_terminated_length": 1137.05625, + "completions/min_length": 809.4, + "completions/min_terminated_length": 809.4, + "entropy": 0.265997040271759, + "epoch": 4.811985898942421, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6153071522712708, + "learning_rate": 1.9505694208868426e-08, + "loss": -0.0008, + "num_tokens": 562600847.0, + "reward": 0.8677083492279053, + "reward_std": 0.06810193136334419, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8677083492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23375769779086114, + "sampling/importance_sampling_ratio/max": 1.9153061866760255, + "sampling/importance_sampling_ratio/mean": 0.9999323844909668, + "sampling/importance_sampling_ratio/min": 0.4602263569831848, + "sampling/sampling_logp_difference/max": 0.876070213317871, + "sampling/sampling_logp_difference/mean": 0.013519445993006229, + "step": 4095 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1677.6, + "completions/max_terminated_length": 1677.6, + "completions/mean_length": 1153.478125, + "completions/mean_terminated_length": 1153.478125, + "completions/min_length": 829.2, + "completions/min_terminated_length": 829.2, + "entropy": 0.2523133546113968, + "epoch": 4.81786133960047, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.8459767699241638, + "learning_rate": 1.889992730797189e-08, + "loss": -0.0005, + "num_tokens": 563335752.0, + "reward": 0.8229166746139527, + "reward_std": 0.10837896019220353, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8229166746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2653172373771667, + "sampling/importance_sampling_ratio/max": 1.9938855409622191, + "sampling/importance_sampling_ratio/mean": 1.0000988125801087, + "sampling/importance_sampling_ratio/min": 0.3662609428167343, + "sampling/sampling_logp_difference/max": 1.1286290645599366, + "sampling/sampling_logp_difference/mean": 0.013645793497562408, + "step": 4100 + }, + { + "epoch": 4.81786133960047, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1599.92, + "eval_completions/max_terminated_length": 1599.92, + "eval_completions/mean_length": 1176.245625, + "eval_completions/mean_terminated_length": 1176.245625, + "eval_completions/min_length": 859.8, + "eval_completions/min_terminated_length": 859.8, + "eval_entropy": 0.2675222271680832, + "eval_frac_reward_zero_std": 0.61, + "eval_loss": 0.0027719642966985703, + "eval_num_tokens": 563335752.0, + "eval_reward": 0.7684583449363709, + "eval_reward_std": 0.08029463842511177, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7684583449363709, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.3004398235678673, + "eval_runtime": 443.3249, + "eval_samples_per_second": 0.226, + "eval_sampling/importance_sampling_ratio/max": 1.9567827081680298, + "eval_sampling/importance_sampling_ratio/mean": 1.0000646901130676, + "eval_sampling/importance_sampling_ratio/min": 0.33781426847563223, + "eval_sampling/sampling_logp_difference/max": 2.017117133140564, + "eval_sampling/sampling_logp_difference/mean": 0.01359828345477581, + "eval_steps_per_second": 0.005, + "step": 4100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1556.4, + "completions/max_terminated_length": 1556.4, + "completions/mean_length": 1202.3125, + "completions/mean_terminated_length": 1202.3125, + "completions/min_length": 947.2, + "completions/min_terminated_length": 947.2, + "entropy": 0.24352549016475677, + "epoch": 4.823736780258519, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.39156123995780945, + "learning_rate": 1.8294160407075355e-08, + "loss": 0.0027, + "num_tokens": 564054156.0, + "reward": 0.7713541746139526, + "reward_std": 0.06558299511671066, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7713541746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2723333746194839, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000266551971435, + "sampling/importance_sampling_ratio/min": 0.2960414350032806, + "sampling/sampling_logp_difference/max": 1.4837098360061645, + "sampling/sampling_logp_difference/mean": 0.0127076530829072, + "step": 4105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1613.4, + "completions/max_terminated_length": 1613.4, + "completions/mean_length": 1188.05625, + "completions/mean_terminated_length": 1188.05625, + "completions/min_length": 928.0, + "completions/min_terminated_length": 928.0, + "entropy": 0.26019893288612367, + "epoch": 4.829612220916569, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5982215404510498, + "learning_rate": 1.768839350617882e-08, + "loss": -0.0017, + "num_tokens": 564766574.0, + "reward": 0.7906771063804626, + "reward_std": 0.06684140712022782, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7906771063804626, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3481956601142883, + "sampling/importance_sampling_ratio/max": 1.9639420986175538, + "sampling/importance_sampling_ratio/mean": 0.9999716877937317, + "sampling/importance_sampling_ratio/min": 0.21439356505870819, + "sampling/sampling_logp_difference/max": 1.8027127265930176, + "sampling/sampling_logp_difference/mean": 0.013319421000778675, + "step": 4110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1601.8, + "completions/max_terminated_length": 1601.8, + "completions/mean_length": 1161.0375, + "completions/mean_terminated_length": 1161.0375, + "completions/min_length": 875.8, + "completions/min_terminated_length": 875.8, + "entropy": 0.2612924247980118, + "epoch": 4.835487661574618, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 1.7082626605282284e-08, + "loss": -0.0, + "num_tokens": 565466730.0, + "reward": 0.86796875, + "reward_std": 0.08403022512793541, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.86796875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1859685465693474, + "sampling/importance_sampling_ratio/max": 1.8856915473937987, + "sampling/importance_sampling_ratio/mean": 1.0000352263450623, + "sampling/importance_sampling_ratio/min": 0.3413155991952294, + "sampling/sampling_logp_difference/max": 3.9962790966033936, + "sampling/sampling_logp_difference/mean": 0.013213282637298106, + "step": 4115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1756.0, + "completions/mean_length": 1275.734375, + "completions/mean_terminated_length": 1272.3380126953125, + "completions/min_length": 923.4, + "completions/min_terminated_length": 923.4, + "entropy": 0.2845246493816376, + "epoch": 4.841363102232667, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.4948183298110962, + "learning_rate": 1.647685970438575e-08, + "loss": -0.0046, + "num_tokens": 566217121.0, + "reward": 0.8736979246139527, + "reward_std": 0.05216329097747803, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8736979246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.22361454963684083, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000300645828246, + "sampling/importance_sampling_ratio/min": 0.3315484285354614, + "sampling/sampling_logp_difference/max": 1.25800461769104, + "sampling/sampling_logp_difference/mean": 0.014182769320905208, + "step": 4120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1612.6, + "completions/max_terminated_length": 1612.6, + "completions/mean_length": 1163.034375, + "completions/mean_terminated_length": 1163.034375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.26198195815086367, + "epoch": 4.847238542890716, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.4309649169445038, + "learning_rate": 1.5871092803489217e-08, + "loss": 0.0025, + "num_tokens": 566941612.0, + "reward": 0.7827604353427887, + "reward_std": 0.05432678535580635, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7827604353427887, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24770253524184227, + "sampling/importance_sampling_ratio/max": 1.9743736267089844, + "sampling/importance_sampling_ratio/mean": 0.9999311327934265, + "sampling/importance_sampling_ratio/min": 0.17661737089511007, + "sampling/sampling_logp_difference/max": 2.602937865257263, + "sampling/sampling_logp_difference/mean": 0.013681701943278312, + "step": 4125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1707.6, + "completions/max_terminated_length": 1707.6, + "completions/mean_length": 1223.121875, + "completions/mean_terminated_length": 1223.121875, + "completions/min_length": 950.2, + "completions/min_terminated_length": 950.2, + "entropy": 0.25878497362136843, + "epoch": 4.853113983548766, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.5751305222511292, + "learning_rate": 1.5265325902592683e-08, + "loss": -0.0006, + "num_tokens": 567651027.0, + "reward": 0.9020833373069763, + "reward_std": 0.031220474932342767, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9020833373069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17570360600948334, + "sampling/importance_sampling_ratio/max": 1.9646418809890747, + "sampling/importance_sampling_ratio/mean": 1.000183892250061, + "sampling/importance_sampling_ratio/min": 0.319706991314888, + "sampling/sampling_logp_difference/max": 1.356735897064209, + "sampling/sampling_logp_difference/mean": 0.013199831359088421, + "step": 4130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1738.2, + "completions/max_terminated_length": 1738.2, + "completions/mean_length": 1208.278125, + "completions/mean_terminated_length": 1208.278125, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "entropy": 0.25107733011245725, + "epoch": 4.858989424206816, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6848601698875427, + "learning_rate": 1.4659559001696147e-08, + "loss": -0.0032, + "num_tokens": 568384108.0, + "reward": 0.8515625119209289, + "reward_std": 0.06289163380861282, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8515625119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.270400308072567, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0002090692520142, + "sampling/importance_sampling_ratio/min": 0.3987681746482849, + "sampling/sampling_logp_difference/max": 1.137919044494629, + "sampling/sampling_logp_difference/mean": 0.012771274335682392, + "step": 4135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1655.8, + "completions/max_terminated_length": 1655.8, + "completions/mean_length": 1167.203125, + "completions/mean_terminated_length": 1167.203125, + "completions/min_length": 904.4, + "completions/min_terminated_length": 904.4, + "entropy": 0.26398766040802, + "epoch": 4.864864864864865, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0, + "learning_rate": 1.4053792100799612e-08, + "loss": 0.0105, + "num_tokens": 569108429.0, + "reward": 0.7697916865348816, + "reward_std": 0.0878813236951828, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7697916865348816, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3016296923160553, + "sampling/importance_sampling_ratio/max": 1.8767553567886353, + "sampling/importance_sampling_ratio/mean": 0.9999534130096436, + "sampling/importance_sampling_ratio/min": 0.3066461071372032, + "sampling/sampling_logp_difference/max": 1.3338868379592896, + "sampling/sampling_logp_difference/mean": 0.013632268644869328, + "step": 4140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1630.6, + "completions/max_terminated_length": 1630.6, + "completions/mean_length": 1170.74375, + "completions/mean_terminated_length": 1170.74375, + "completions/min_length": 885.8, + "completions/min_terminated_length": 885.8, + "entropy": 0.24844418168067933, + "epoch": 4.870740305522914, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5103340148925781, + "learning_rate": 1.3448025199903076e-08, + "loss": 0.0057, + "num_tokens": 569776331.0, + "reward": 0.9598958373069764, + "reward_std": 0.04087250307202339, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9598958373069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.11710455864667893, + "sampling/importance_sampling_ratio/max": 1.9135148763656615, + "sampling/importance_sampling_ratio/mean": 1.0000514268875123, + "sampling/importance_sampling_ratio/min": 0.3595329821109772, + "sampling/sampling_logp_difference/max": 1.1677767515182496, + "sampling/sampling_logp_difference/mean": 0.012815902382135392, + "step": 4145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1628.6, + "completions/max_terminated_length": 1628.6, + "completions/mean_length": 1177.821875, + "completions/mean_terminated_length": 1177.821875, + "completions/min_length": 863.4, + "completions/min_terminated_length": 863.4, + "entropy": 0.2565174579620361, + "epoch": 4.876615746180963, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.612808108329773, + "learning_rate": 1.2842258299006541e-08, + "loss": -0.0011, + "num_tokens": 570475090.0, + "reward": 0.8589062571525574, + "reward_std": 0.06039836704730987, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8589062571525574, + "rewards/e2e_recall_precision_mixed_reward/std": 0.24054351449012756, + "sampling/importance_sampling_ratio/max": 1.9100402116775512, + "sampling/importance_sampling_ratio/mean": 0.9999492049217225, + "sampling/importance_sampling_ratio/min": 0.3987319231033325, + "sampling/sampling_logp_difference/max": 1.0621936082839967, + "sampling/sampling_logp_difference/mean": 0.013036507740616798, + "step": 4150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1603.8, + "completions/max_terminated_length": 1603.8, + "completions/mean_length": 1136.215625, + "completions/mean_terminated_length": 1136.215625, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "entropy": 0.2555226027965546, + "epoch": 4.882491186839013, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.4172721207141876, + "learning_rate": 1.2236491398110007e-08, + "loss": 0.0025, + "num_tokens": 571161863.0, + "reward": 0.9296875, + "reward_std": 0.056744667887687686, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9296875, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1651596575975418, + "sampling/importance_sampling_ratio/max": 1.9613843202590941, + "sampling/importance_sampling_ratio/mean": 1.000058114528656, + "sampling/importance_sampling_ratio/min": 0.2753281805664301, + "sampling/sampling_logp_difference/max": 1.6068052053451538, + "sampling/sampling_logp_difference/mean": 0.013144350983202458, + "step": 4155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1938.4, + "completions/max_terminated_length": 1938.4, + "completions/mean_length": 1309.365625, + "completions/mean_terminated_length": 1309.365625, + "completions/min_length": 966.0, + "completions/min_terminated_length": 966.0, + "entropy": 0.2880063831806183, + "epoch": 4.888366627497062, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5996348857879639, + "learning_rate": 1.1630724497213472e-08, + "loss": 0.0025, + "num_tokens": 571925932.0, + "reward": 0.785937511920929, + "reward_std": 0.10168065577745437, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.785937511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.33307317793369295, + "sampling/importance_sampling_ratio/max": 1.990170383453369, + "sampling/importance_sampling_ratio/mean": 1.0000520944595337, + "sampling/importance_sampling_ratio/min": 0.3627611517906189, + "sampling/sampling_logp_difference/max": 1.0843160629272461, + "sampling/sampling_logp_difference/mean": 0.014332829415798188, + "step": 4160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1756.4, + "completions/max_terminated_length": 1756.4, + "completions/mean_length": 1272.540625, + "completions/mean_terminated_length": 1272.540625, + "completions/min_length": 909.8, + "completions/min_terminated_length": 909.8, + "entropy": 0.28079177141189576, + "epoch": 4.894242068155112, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 1.1024957596316936e-08, + "loss": -0.0013, + "num_tokens": 572682153.0, + "reward": 0.7119791746139527, + "reward_std": 0.06243942677974701, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7119791746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3010064959526062, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999913239479065, + "sampling/importance_sampling_ratio/min": 0.24750588452134253, + "sampling/sampling_logp_difference/max": 3.2225181341171263, + "sampling/sampling_logp_difference/mean": 0.014154410175979137, + "step": 4165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1553.8, + "completions/max_terminated_length": 1553.8, + "completions/mean_length": 1216.925, + "completions/mean_terminated_length": 1216.925, + "completions/min_length": 975.8, + "completions/min_terminated_length": 975.8, + "entropy": 0.2597874790430069, + "epoch": 4.900117508813161, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.6187815070152283, + "learning_rate": 1.04191906954204e-08, + "loss": -0.0021, + "num_tokens": 573397233.0, + "reward": 0.9138020992279052, + "reward_std": 0.0694403514266014, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9138020992279052, + "rewards/e2e_recall_precision_mixed_reward/std": 0.15716297775506974, + "sampling/importance_sampling_ratio/max": 1.9616039276123047, + "sampling/importance_sampling_ratio/mean": 1.000029969215393, + "sampling/importance_sampling_ratio/min": 0.3573975801467896, + "sampling/sampling_logp_difference/max": 1.1084071159362794, + "sampling/sampling_logp_difference/mean": 0.013234620355069638, + "step": 4170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1742.2, + "completions/max_terminated_length": 1742.2, + "completions/mean_length": 1258.2375, + "completions/mean_terminated_length": 1258.2375, + "completions/min_length": 925.2, + "completions/min_terminated_length": 925.2, + "entropy": 0.27962875962257383, + "epoch": 4.9059929494712105, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.5773242115974426, + "learning_rate": 9.813423794523867e-09, + "loss": -0.0023, + "num_tokens": 574103741.0, + "reward": 0.9208333492279053, + "reward_std": 0.07986356988549233, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9208333492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.1817111313343048, + "sampling/importance_sampling_ratio/max": 1.8720341920852661, + "sampling/importance_sampling_ratio/mean": 1.0000491261482238, + "sampling/importance_sampling_ratio/min": 0.39683855772018434, + "sampling/sampling_logp_difference/max": 0.9288500308990478, + "sampling/sampling_logp_difference/mean": 0.013723740726709366, + "step": 4175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1788.6, + "completions/max_terminated_length": 1780.8, + "completions/mean_length": 1224.46875, + "completions/mean_terminated_length": 1221.0921875, + "completions/min_length": 906.4, + "completions/min_terminated_length": 906.4, + "entropy": 0.27353797256946566, + "epoch": 4.91186839012926, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5145543813705444, + "learning_rate": 9.207656893627331e-09, + "loss": -0.007, + "num_tokens": 574809695.0, + "reward": 0.7510416746139527, + "reward_std": 0.06682446748018264, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7510416746139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.3081136792898178, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999987125396729, + "sampling/importance_sampling_ratio/min": 0.3361870855093002, + "sampling/sampling_logp_difference/max": 1.3072755098342896, + "sampling/sampling_logp_difference/mean": 0.013947272859513759, + "step": 4180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1716.6, + "completions/max_terminated_length": 1716.6, + "completions/mean_length": 1253.65, + "completions/mean_terminated_length": 1253.65, + "completions/min_length": 966.6, + "completions/min_terminated_length": 966.6, + "entropy": 0.28181648850440977, + "epoch": 4.917743830787309, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 8.601889992730796e-09, + "loss": 0.002, + "num_tokens": 575558047.0, + "reward": 0.7856770873069763, + "reward_std": 0.03310954719781876, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7856770873069763, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2692906141281128, + "sampling/importance_sampling_ratio/max": 1.9965229749679565, + "sampling/importance_sampling_ratio/mean": 0.9999869465827942, + "sampling/importance_sampling_ratio/min": 0.26230895724147557, + "sampling/sampling_logp_difference/max": 1.7456336498260498, + "sampling/sampling_logp_difference/mean": 0.014158726483583451, + "step": 4185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1530.2, + "completions/max_terminated_length": 1530.2, + "completions/mean_length": 1196.39375, + "completions/mean_terminated_length": 1196.39375, + "completions/min_length": 923.0, + "completions/min_terminated_length": 923.0, + "entropy": 0.26490248143672945, + "epoch": 4.923619271445358, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5220557451248169, + "learning_rate": 7.99612309183426e-09, + "loss": -0.0034, + "num_tokens": 576251005.0, + "reward": 0.8921875119209289, + "reward_std": 0.06537900567054748, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8921875119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21443742215633393, + "sampling/importance_sampling_ratio/max": 1.888991355895996, + "sampling/importance_sampling_ratio/mean": 0.999981415271759, + "sampling/importance_sampling_ratio/min": 0.31290013790130616, + "sampling/sampling_logp_difference/max": 1.2028279781341553, + "sampling/sampling_logp_difference/mean": 0.013373297452926636, + "step": 4190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1618.0, + "completions/max_terminated_length": 1618.0, + "completions/mean_length": 1164.04375, + "completions/mean_terminated_length": 1164.04375, + "completions/min_length": 917.8, + "completions/min_terminated_length": 917.8, + "entropy": 0.24597953855991364, + "epoch": 4.929494712103407, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.49730363488197327, + "learning_rate": 7.3903561909377266e-09, + "loss": 0.0001, + "num_tokens": 576939563.0, + "reward": 0.7997395873069764, + "reward_std": 0.1168543741106987, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.7997395873069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.27827770859003065, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0000683307647704, + "sampling/importance_sampling_ratio/min": 0.37277783155441285, + "sampling/sampling_logp_difference/max": 1.186050796508789, + "sampling/sampling_logp_difference/mean": 0.01289830356836319, + "step": 4195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1618.8, + "completions/max_terminated_length": 1618.8, + "completions/mean_length": 1225.7625, + "completions/mean_terminated_length": 1225.7625, + "completions/min_length": 839.8, + "completions/min_terminated_length": 839.8, + "entropy": 0.2759159684181213, + "epoch": 4.9353701527614575, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.5151962041854858, + "learning_rate": 6.784589290041192e-09, + "loss": -0.0058, + "num_tokens": 577667935.0, + "reward": 0.940625011920929, + "reward_std": 0.06629720851778984, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.940625011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.14214378148317336, + "sampling/importance_sampling_ratio/max": 1.9627459526062012, + "sampling/importance_sampling_ratio/mean": 0.9999911308288574, + "sampling/importance_sampling_ratio/min": 0.34458776712417605, + "sampling/sampling_logp_difference/max": 1.183539056777954, + "sampling/sampling_logp_difference/mean": 0.013748652674257756, + "step": 4200 + }, + { + "epoch": 4.9353701527614575, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1622.84, + "eval_completions/max_terminated_length": 1622.84, + "eval_completions/mean_length": 1187.468125, + "eval_completions/mean_terminated_length": 1187.468125, + "eval_completions/min_length": 892.2, + "eval_completions/min_terminated_length": 892.2, + "eval_entropy": 0.27012341380119326, + "eval_frac_reward_zero_std": 0.65, + "eval_loss": 0.0018613528227433562, + "eval_num_tokens": 577667935.0, + "eval_reward": 0.7749166774749756, + "eval_reward_std": 0.07821166217327118, + "eval_rewards/e2e_recall_precision_mixed_reward/mean": 0.7749166774749756, + "eval_rewards/e2e_recall_precision_mixed_reward/std": 0.29328397393226624, + "eval_runtime": 453.8836, + "eval_samples_per_second": 0.22, + "eval_sampling/importance_sampling_ratio/max": 1.9473948335647584, + "eval_sampling/importance_sampling_ratio/mean": 0.9999662327766419, + "eval_sampling/importance_sampling_ratio/min": 0.3151546062529087, + "eval_sampling/sampling_logp_difference/max": 1.3373068857192993, + "eval_sampling/sampling_logp_difference/mean": 0.01363888442516327, + "eval_steps_per_second": 0.004, + "step": 4200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1647.2, + "completions/max_terminated_length": 1647.2, + "completions/mean_length": 1198.703125, + "completions/mean_terminated_length": 1198.703125, + "completions/min_length": 915.4, + "completions/min_terminated_length": 915.4, + "entropy": 0.2633301138877869, + "epoch": 4.941245593419507, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 6.1788223891446564e-09, + "loss": -0.0032, + "num_tokens": 578373168.0, + "reward": 0.9223958492279053, + "reward_std": 0.08152852952480316, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9223958492279053, + "rewards/e2e_recall_precision_mixed_reward/std": 0.152499720454216, + "sampling/importance_sampling_ratio/max": 1.9259612321853639, + "sampling/importance_sampling_ratio/mean": 0.9999021768569947, + "sampling/importance_sampling_ratio/min": 0.39434434175491334, + "sampling/sampling_logp_difference/max": 1.0410821914672852, + "sampling/sampling_logp_difference/mean": 0.013154360838234425, + "step": 4205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1669.0, + "completions/max_terminated_length": 1669.0, + "completions/mean_length": 1275.25, + "completions/mean_terminated_length": 1275.25, + "completions/min_length": 1009.8, + "completions/min_terminated_length": 1009.8, + "entropy": 0.2745696842670441, + "epoch": 4.947121034077556, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.5402174592018127, + "learning_rate": 5.573055488248122e-09, + "loss": 0.0021, + "num_tokens": 579086176.0, + "reward": 0.9057291746139526, + "reward_std": 0.06898038685321808, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9057291746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16145955622196198, + "sampling/importance_sampling_ratio/max": 1.9250396013259887, + "sampling/importance_sampling_ratio/mean": 0.9999170303344727, + "sampling/importance_sampling_ratio/min": 0.3829080641269684, + "sampling/sampling_logp_difference/max": 0.9843248128890991, + "sampling/sampling_logp_difference/mean": 0.013959725014865398, + "step": 4210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1524.2, + "completions/max_terminated_length": 1524.2, + "completions/mean_length": 1170.28125, + "completions/mean_terminated_length": 1170.28125, + "completions/min_length": 893.8, + "completions/min_terminated_length": 893.8, + "entropy": 0.2561034023761749, + "epoch": 4.952996474735605, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.0, + "learning_rate": 4.967288587351586e-09, + "loss": -0.0009, + "num_tokens": 579767418.0, + "reward": 0.9182291746139526, + "reward_std": 0.02048187479376793, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9182291746139526, + "rewards/e2e_recall_precision_mixed_reward/std": 0.16392614617943763, + "sampling/importance_sampling_ratio/max": 1.9762834310531616, + "sampling/importance_sampling_ratio/mean": 0.999981677532196, + "sampling/importance_sampling_ratio/min": 0.34127419590950014, + "sampling/sampling_logp_difference/max": 1.1679274797439576, + "sampling/sampling_logp_difference/mean": 0.013132588565349579, + "step": 4215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1799.6, + "completions/max_terminated_length": 1799.6, + "completions/mean_length": 1242.621875, + "completions/mean_terminated_length": 1242.621875, + "completions/min_length": 910.2, + "completions/min_terminated_length": 910.2, + "entropy": 0.2759708225727081, + "epoch": 4.9588719153936545, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.43732741475105286, + "learning_rate": 4.361521686455052e-09, + "loss": 0.0079, + "num_tokens": 580478561.0, + "reward": 0.8062500119209289, + "reward_std": 0.10620709657669067, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8062500119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.26027744710445405, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999176859855652, + "sampling/importance_sampling_ratio/min": 0.3623032122850418, + "sampling/sampling_logp_difference/max": 1.5115676403045655, + "sampling/sampling_logp_difference/mean": 0.013757564499974251, + "step": 4220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1617.6, + "completions/max_terminated_length": 1617.6, + "completions/mean_length": 1131.6875, + "completions/mean_terminated_length": 1131.6875, + "completions/min_length": 771.8, + "completions/min_terminated_length": 771.8, + "entropy": 0.2657478004693985, + "epoch": 4.964747356051704, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.5924484133720398, + "learning_rate": 3.755754785558517e-09, + "loss": -0.0043, + "num_tokens": 581201549.0, + "reward": 0.8257812619209289, + "reward_std": 0.08197875022888183, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8257812619209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.23682957887649536, + "sampling/importance_sampling_ratio/max": 1.966193675994873, + "sampling/importance_sampling_ratio/mean": 0.9998813509941101, + "sampling/importance_sampling_ratio/min": 0.3465057075023651, + "sampling/sampling_logp_difference/max": 1.3918760061264037, + "sampling/sampling_logp_difference/mean": 0.013801524788141251, + "step": 4225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1520.8, + "completions/max_terminated_length": 1520.8, + "completions/mean_length": 1192.865625, + "completions/mean_terminated_length": 1192.865625, + "completions/min_length": 935.4, + "completions/min_terminated_length": 935.4, + "entropy": 0.26043239533901213, + "epoch": 4.970622796709753, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.42304831743240356, + "learning_rate": 3.149987884661982e-09, + "loss": -0.0002, + "num_tokens": 581893074.0, + "reward": 0.887500011920929, + "reward_std": 0.05844337120652199, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.887500011920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.21982333362102507, + "sampling/importance_sampling_ratio/max": 1.8982154846191406, + "sampling/importance_sampling_ratio/mean": 0.9999570608139038, + "sampling/importance_sampling_ratio/min": 0.3804735541343689, + "sampling/sampling_logp_difference/max": 1.2776761054992676, + "sampling/sampling_logp_difference/mean": 0.013370229117572307, + "step": 4230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1726.0, + "completions/max_terminated_length": 1726.0, + "completions/mean_length": 1219.128125, + "completions/mean_terminated_length": 1219.128125, + "completions/min_length": 872.2, + "completions/min_terminated_length": 872.2, + "entropy": 0.25902561843395233, + "epoch": 4.976498237367803, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.0, + "learning_rate": 2.544220983765447e-09, + "loss": -0.002, + "num_tokens": 582591003.0, + "reward": 0.8218750119209289, + "reward_std": 0.07310913950204849, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8218750119209289, + "rewards/e2e_recall_precision_mixed_reward/std": 0.28861500322818756, + "sampling/importance_sampling_ratio/max": 1.9743704080581665, + "sampling/importance_sampling_ratio/mean": 0.9999637246131897, + "sampling/importance_sampling_ratio/min": 0.4172662615776062, + "sampling/sampling_logp_difference/max": 0.8795746564865112, + "sampling/sampling_logp_difference/mean": 0.013227641209959983, + "step": 4235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1684.2, + "completions/max_terminated_length": 1684.2, + "completions/mean_length": 1250.65, + "completions/mean_terminated_length": 1250.65, + "completions/min_length": 897.6, + "completions/min_terminated_length": 897.6, + "entropy": 0.27604796886444094, + "epoch": 4.982373678025852, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.0, + "learning_rate": 1.938454082868912e-09, + "loss": 0.0021, + "num_tokens": 583305659.0, + "reward": 0.9108333349227905, + "reward_std": 0.06361775994300842, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.9108333349227905, + "rewards/e2e_recall_precision_mixed_reward/std": 0.14266837537288665, + "sampling/importance_sampling_ratio/max": 1.9794315576553345, + "sampling/importance_sampling_ratio/mean": 0.9999402523040771, + "sampling/importance_sampling_ratio/min": 0.3792805254459381, + "sampling/sampling_logp_difference/max": 1.2729737639427186, + "sampling/sampling_logp_difference/mean": 0.013952986150979996, + "step": 4240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1632.8, + "completions/max_terminated_length": 1632.8, + "completions/mean_length": 1154.265625, + "completions/mean_terminated_length": 1154.265625, + "completions/min_length": 861.8, + "completions/min_terminated_length": 861.8, + "entropy": 0.2639856070280075, + "epoch": 4.9882491186839015, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.4559522569179535, + "learning_rate": 1.332687181972377e-09, + "loss": 0.0018, + "num_tokens": 583989680.0, + "reward": 0.879687511920929, + "reward_std": 0.04789134860038757, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.879687511920929, + "rewards/e2e_recall_precision_mixed_reward/std": 0.17645513415336608, + "sampling/importance_sampling_ratio/max": 1.9888315200805664, + "sampling/importance_sampling_ratio/mean": 0.9999840378761291, + "sampling/importance_sampling_ratio/min": 0.39148822128772737, + "sampling/sampling_logp_difference/max": 1.0263358354568481, + "sampling/sampling_logp_difference/mean": 0.01353690456598997, + "step": 4245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.003125, + "completions/max_length": 1750.6, + "completions/max_terminated_length": 1740.6, + "completions/mean_length": 1204.640625, + "completions/mean_terminated_length": 1201.0263427734376, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "entropy": 0.2623937726020813, + "epoch": 4.994124559341951, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.6408044099807739, + "learning_rate": 7.26920281075842e-10, + "loss": -0.0026, + "num_tokens": 584716969.0, + "reward": 0.8661458373069764, + "reward_std": 0.09312712252140046, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8661458373069764, + "rewards/e2e_recall_precision_mixed_reward/std": 0.25531432032585144, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9999866724014282, + "sampling/importance_sampling_ratio/min": 0.3677119523286819, + "sampling/sampling_logp_difference/max": 1.0782905578613282, + "sampling/sampling_logp_difference/mean": 0.013216838613152504, + "step": 4250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1752.6, + "completions/max_terminated_length": 1748.2, + "completions/mean_length": 1165.646875, + "completions/mean_terminated_length": 1150.09833984375, + "completions/min_length": 852.2, + "completions/min_terminated_length": 852.2, + "entropy": 0.2466509908437729, + "epoch": 5.0, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.45128658413887024, + "learning_rate": 1.21153380179307e-10, + "loss": -0.0331, + "num_tokens": 585407944.0, + "reward": 0.8033854246139527, + "reward_std": 0.09269858747720719, + "rewards/e2e_recall_precision_mixed_reward/mean": 0.8033854246139527, + "rewards/e2e_recall_precision_mixed_reward/std": 0.2835982650518417, + "sampling/importance_sampling_ratio/max": 1.890273141860962, + "sampling/importance_sampling_ratio/mean": 1.000039005279541, + "sampling/importance_sampling_ratio/min": 0.3465990424156189, + "sampling/sampling_logp_difference/max": 1.0836182117462159, + "sampling/sampling_logp_difference/mean": 0.01285779345780611, + "step": 4255 + }, + { + "epoch": 5.0, + "step": 4255, + "total_flos": 0.0, + "train_loss": 0.0002628941724753198, + "train_runtime": 110827.1867, + "train_samples_per_second": 0.154, + "train_steps_per_second": 0.038 + } + ], + "logging_steps": 5, + "max_steps": 4255, + "num_input_tokens_seen": 585407944, + "num_train_epochs": 5, + "save_steps": 400, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}