diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6931 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.078, + "eval_steps": 500, + "global_step": 975, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.00555555559694767, + "clip_ratio/high_mean": 0.002777777798473835, + "clip_ratio/low_mean": 0.01785130724310875, + "clip_ratio/low_min": 0.00555555559694767, + "clip_ratio/region_mean": 0.0206290852278471, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 301.8375, + "completions/mean_terminated_length": 301.8375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.3019494503736496, + "epoch": 0.0004, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.12727217376232147, + "kl": 0.04473987314850092, + "learning_rate": 1.137216e-06, + "loss": -1.5935138799250125e-05, + "num_tokens": 68886.0, + "reward": 0.3598749995231628, + "reward_std": 0.2971616297960281, + "rewards/env_game_reward/mean": 0.3598749995231628, + "rewards/env_game_reward/std": 0.40898369550704955, + "sampling/importance_sampling_ratio/max": 1.8200685739517213, + "sampling/importance_sampling_ratio/mean": 0.9960397481918335, + "sampling/importance_sampling_ratio/min": 0.2915335774421692, + "sampling/sampling_logp_difference/max": 1.2875514030456543, + "sampling/sampling_logp_difference/mean": 0.07988147884607315, + "step": 5, + "step_time": 2.435299045799911 + }, + { + "clip_ratio/high_max": 0.02977941185235977, + "clip_ratio/high_mean": 0.014889705926179886, + "clip_ratio/low_mean": 0.019972265511751176, + "clip_ratio/low_min": 0.00555555559694767, + "clip_ratio/region_mean": 0.034861971624195576, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 293.7875, + "completions/mean_terminated_length": 293.7875, + "completions/min_length": 219.2, + "completions/min_terminated_length": 219.2, + "entropy": 0.3496206432580948, + "epoch": 0.0008, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.2412712723016739, + "kl": 0.021455740835517646, + "learning_rate": 2.5587359999999995e-06, + "loss": 0.0005658970680087805, + "num_tokens": 136575.0, + "reward": 0.38937501311302186, + "reward_std": 0.25508877336978913, + "rewards/env_game_reward/mean": 0.38937501311302186, + "rewards/env_game_reward/std": 0.37691490650177, + "sampling/importance_sampling_ratio/max": 1.777461338043213, + "sampling/importance_sampling_ratio/mean": 0.9673051834106445, + "sampling/importance_sampling_ratio/min": 0.18635750880930574, + "sampling/sampling_logp_difference/max": 2.5953728675842287, + "sampling/sampling_logp_difference/mean": 0.10190577432513237, + "step": 10, + "step_time": 2.0430383474005795 + }, + { + "clip_ratio/high_max": 0.012132352963089944, + "clip_ratio/high_mean": 0.006066176481544972, + "clip_ratio/low_mean": 0.008350533433258533, + "clip_ratio/low_min": 0.005263157933950424, + "clip_ratio/region_mean": 0.01441670972853899, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.8, + "completions/max_terminated_length": 373.8, + "completions/mean_length": 282.05, + "completions/mean_terminated_length": 282.05, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.36802791953086855, + "epoch": 0.0012, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.1376928836107254, + "kl": 0.08403404112905263, + "learning_rate": 3.9802559999999995e-06, + "loss": 0.0002614784985780716, + "num_tokens": 202248.0, + "reward": 0.3897500097751617, + "reward_std": 0.2549119979143143, + "rewards/env_game_reward/mean": 0.3897500097751617, + "rewards/env_game_reward/std": 0.414092218875885, + "sampling/importance_sampling_ratio/max": 1.847982144355774, + "sampling/importance_sampling_ratio/mean": 1.0205587506294251, + "sampling/importance_sampling_ratio/min": 0.4687712244689465, + "sampling/sampling_logp_difference/max": 1.0548641800880432, + "sampling/sampling_logp_difference/mean": 0.0759758085012436, + "step": 15, + "step_time": 2.0221517859987217 + }, + { + "clip_ratio/high_max": 0.023570261523127557, + "clip_ratio/high_mean": 0.011785130761563779, + "clip_ratio/low_mean": 0.03886523898690939, + "clip_ratio/low_min": 0.01670106649398804, + "clip_ratio/region_mean": 0.050650370121002194, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 297.625, + "completions/mean_terminated_length": 297.625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.3216476052999496, + "epoch": 0.0016, + "frac_reward_zero_std": 0.2, + "grad_norm": 4.7970099449157715, + "kl": 13.012010221928358, + "learning_rate": 5.401775999999999e-06, + "loss": 0.0002645790576934814, + "num_tokens": 271508.0, + "reward": 0.4497500121593475, + "reward_std": 0.4034044086933136, + "rewards/env_game_reward/mean": 0.4497500121593475, + "rewards/env_game_reward/std": 0.42049501538276673, + "sampling/importance_sampling_ratio/max": 1.9190717458724975, + "sampling/importance_sampling_ratio/mean": 0.9109017491340637, + "sampling/importance_sampling_ratio/min": 0.12080914080142975, + "sampling/sampling_logp_difference/max": 1.8006493330001831, + "sampling/sampling_logp_difference/mean": 0.11719217151403427, + "step": 20, + "step_time": 2.1648863314010667 + }, + { + "clip_ratio/high_max": 0.01180555559694767, + "clip_ratio/high_mean": 0.005902777798473835, + "clip_ratio/low_mean": 0.026298435963690282, + "clip_ratio/low_min": 0.010644257813692094, + "clip_ratio/region_mean": 0.032201213762164116, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 286.9375, + "completions/mean_terminated_length": 286.9375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.3577309399843216, + "epoch": 0.002, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.12569169700145721, + "kl": 0.5408080242574215, + "learning_rate": 6.8232959999999994e-06, + "loss": -0.0006784088443964719, + "num_tokens": 338918.0, + "reward": 0.48712502121925355, + "reward_std": 0.2656953662633896, + "rewards/env_game_reward/mean": 0.48712502121925355, + "rewards/env_game_reward/std": 0.37898687124252317, + "sampling/importance_sampling_ratio/max": 1.827208924293518, + "sampling/importance_sampling_ratio/mean": 0.95396728515625, + "sampling/importance_sampling_ratio/min": 0.2660827249288559, + "sampling/sampling_logp_difference/max": 1.3808974146842956, + "sampling/sampling_logp_difference/mean": 0.1048379361629486, + "step": 25, + "step_time": 2.043276316798438 + }, + { + "clip_ratio/high_max": 0.011513157933950424, + "clip_ratio/high_mean": 0.005756578966975212, + "clip_ratio/low_mean": 0.017378311045467852, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.023134890012443066, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.8, + "completions/max_terminated_length": 365.8, + "completions/mean_length": 278.425, + "completions/mean_terminated_length": 278.425, + "completions/min_length": 194.6, + "completions/min_terminated_length": 194.6, + "entropy": 0.5190394788980484, + "epoch": 0.0024, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.2680231034755707, + "kl": 0.7589091017842293, + "learning_rate": 8.244816e-06, + "loss": -2.776859328150749e-05, + "num_tokens": 404073.0, + "reward": 0.4043750107288361, + "reward_std": 0.25508877336978913, + "rewards/env_game_reward/mean": 0.4043750107288361, + "rewards/env_game_reward/std": 0.409240585565567, + "sampling/importance_sampling_ratio/max": 1.9664097785949708, + "sampling/importance_sampling_ratio/mean": 0.9075453758239747, + "sampling/importance_sampling_ratio/min": 0.20434444732964038, + "sampling/sampling_logp_difference/max": 1.4564614772796631, + "sampling/sampling_logp_difference/mean": 0.14034032225608825, + "step": 30, + "step_time": 1.9978871219995198 + }, + { + "clip_ratio/high_max": 0.02389705888926983, + "clip_ratio/high_mean": 0.011948529444634915, + "clip_ratio/low_mean": 0.011948529072105885, + "clip_ratio/low_min": 0.00625, + "clip_ratio/region_mean": 0.023897058516740798, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.4, + "completions/max_terminated_length": 357.4, + "completions/mean_length": 272.075, + "completions/mean_terminated_length": 272.075, + "completions/min_length": 166.4, + "completions/min_terminated_length": 166.4, + "entropy": 0.5090951889753341, + "epoch": 0.0028, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.07063741236925125, + "kl": 0.7235948637127876, + "learning_rate": 9.666336e-06, + "loss": -0.0005834823474287987, + "num_tokens": 468753.0, + "reward": 0.487250018119812, + "reward_std": 0.24430539906024934, + "rewards/env_game_reward/mean": 0.487250018119812, + "rewards/env_game_reward/std": 0.39645681977272035, + "sampling/importance_sampling_ratio/max": 1.7759217023849487, + "sampling/importance_sampling_ratio/mean": 0.8916664958000183, + "sampling/importance_sampling_ratio/min": 0.1796780303120613, + "sampling/sampling_logp_difference/max": 5.293339991569519, + "sampling/sampling_logp_difference/mean": 0.22543202042579652, + "step": 35, + "step_time": 1.970122839200485 + }, + { + "clip_ratio/high_max": 0.011764705926179887, + "clip_ratio/high_mean": 0.005882352963089943, + "clip_ratio/low_mean": 0.007465277798473835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.013347630761563778, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.8, + "completions/max_terminated_length": 367.8, + "completions/mean_length": 285.5, + "completions/mean_terminated_length": 285.5, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "entropy": 0.4019551068544388, + "epoch": 0.0032, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.05544561892747879, + "kl": 0.5415760695934295, + "learning_rate": 9.950639790096085e-06, + "loss": -0.0005978990811854601, + "num_tokens": 535867.0, + "reward": 0.6225000143051147, + "reward_std": 0.30759145617485045, + "rewards/env_game_reward/mean": 0.6225000143051147, + "rewards/env_game_reward/std": 0.42984657883644106, + "sampling/importance_sampling_ratio/max": 1.527849555015564, + "sampling/importance_sampling_ratio/mean": 0.8954729080200196, + "sampling/importance_sampling_ratio/min": 0.24045688807964324, + "sampling/sampling_logp_difference/max": 1.2608455896377564, + "sampling/sampling_logp_difference/mean": 0.09565315097570419, + "step": 40, + "step_time": 2.158330711200688 + }, + { + "clip_ratio/high_max": 0.011764705926179887, + "clip_ratio/high_mean": 0.005882352963089943, + "clip_ratio/low_mean": 0.0029411764815449716, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008823529444634914, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.4, + "completions/max_terminated_length": 374.4, + "completions/mean_length": 297.7125, + "completions/mean_terminated_length": 297.7125, + "completions/min_length": 206.8, + "completions/min_terminated_length": 206.8, + "entropy": 0.45966209173202516, + "epoch": 0.0036, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.10021962970495224, + "kl": 4.332460595667362, + "learning_rate": 9.950638937361476e-06, + "loss": 0.00036982442252337934, + "num_tokens": 604195.0, + "reward": 0.5772500157356262, + "reward_std": 0.22309219390153884, + "rewards/env_game_reward/mean": 0.5772500157356262, + "rewards/env_game_reward/std": 0.40507532358169557, + "sampling/importance_sampling_ratio/max": 2.0866220235824584, + "sampling/importance_sampling_ratio/mean": 0.9108154296875, + "sampling/importance_sampling_ratio/min": 0.22963042184710503, + "sampling/sampling_logp_difference/max": 1.6737257480621337, + "sampling/sampling_logp_difference/mean": 0.13707162737846373, + "step": 45, + "step_time": 2.022085135200905 + }, + { + "clip_ratio/high_max": 0.016993464156985282, + "clip_ratio/high_mean": 0.008496732078492641, + "clip_ratio/low_mean": 0.005882352963089943, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014379085041582584, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.2, + "completions/max_terminated_length": 374.2, + "completions/mean_length": 303.05, + "completions/mean_terminated_length": 303.05, + "completions/min_length": 234.2, + "completions/min_terminated_length": 234.2, + "entropy": 0.5295720070600509, + "epoch": 0.004, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.03144078329205513, + "kl": 0.7867398172616958, + "learning_rate": 9.950637428677324e-06, + "loss": -0.0003503738436847925, + "num_tokens": 673761.0, + "reward": 0.5325000166893006, + "reward_std": 0.26516503691673277, + "rewards/env_game_reward/mean": 0.5325000166893006, + "rewards/env_game_reward/std": 0.3944298326969147, + "sampling/importance_sampling_ratio/max": 1.7604175567626954, + "sampling/importance_sampling_ratio/mean": 1.0008994936943054, + "sampling/importance_sampling_ratio/min": 0.17906015515327453, + "sampling/sampling_logp_difference/max": 1.3992098808288573, + "sampling/sampling_logp_difference/mean": 0.11947420984506607, + "step": 50, + "step_time": 2.0436965212007636 + }, + { + "clip_ratio/high_max": 0.0125, + "clip_ratio/high_mean": 0.00625, + "clip_ratio/low_mean": 0.009375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.2, + "completions/max_terminated_length": 365.2, + "completions/mean_length": 266.475, + "completions/mean_terminated_length": 266.475, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.48174993991851806, + "epoch": 0.0044, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.08803996443748474, + "kl": 0.5854233294725418, + "learning_rate": 9.950635264043898e-06, + "loss": -0.0005686009302735328, + "num_tokens": 737069.0, + "reward": 0.6225000143051147, + "reward_std": 0.26516503989696505, + "rewards/env_game_reward/mean": 0.6225000143051147, + "rewards/env_game_reward/std": 0.3427117049694061, + "sampling/importance_sampling_ratio/max": 1.6551067352294921, + "sampling/importance_sampling_ratio/mean": 0.8545376658439636, + "sampling/importance_sampling_ratio/min": 0.25754888653755187, + "sampling/sampling_logp_difference/max": 1.1337799072265624, + "sampling/sampling_logp_difference/mean": 0.11593741178512573, + "step": 55, + "step_time": 1.9781528124010948 + }, + { + "clip_ratio/high_max": 0.00625, + "clip_ratio/high_mean": 0.003125, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.2, + "completions/max_terminated_length": 357.2, + "completions/mean_length": 297.975, + "completions/mean_terminated_length": 297.975, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.5426139056682586, + "epoch": 0.0048, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.05449790135025978, + "kl": 0.9533621698617936, + "learning_rate": 9.950632443461579e-06, + "loss": -0.0006543617229908705, + "num_tokens": 805277.0, + "reward": 0.5618750214576721, + "reward_std": 0.24448217451572418, + "rewards/env_game_reward/mean": 0.5618750214576721, + "rewards/env_game_reward/std": 0.3882100999355316, + "sampling/importance_sampling_ratio/max": 1.7909126281738281, + "sampling/importance_sampling_ratio/mean": 0.9404440999031067, + "sampling/importance_sampling_ratio/min": 0.09320367276668548, + "sampling/sampling_logp_difference/max": 1.6501066446304322, + "sampling/sampling_logp_difference/mean": 0.14056455492973327, + "step": 60, + "step_time": 1.980496010000934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.014763931930065154, + "clip_ratio/low_min": 0.005882352963089943, + "clip_ratio/region_mean": 0.014763931930065154, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.8, + "completions/max_terminated_length": 365.8, + "completions/mean_length": 287.8875, + "completions/mean_terminated_length": 287.8875, + "completions/min_length": 208.4, + "completions/min_terminated_length": 208.4, + "entropy": 0.4251057833433151, + "epoch": 0.0052, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.2114567756652832, + "kl": 1.2128720700740814, + "learning_rate": 9.95062896693086e-06, + "loss": -0.0011168548837304116, + "num_tokens": 872623.0, + "reward": 0.6370000123977662, + "reward_std": 0.2870853543281555, + "rewards/env_game_reward/mean": 0.6370000123977662, + "rewards/env_game_reward/std": 0.4172030627727509, + "sampling/importance_sampling_ratio/max": 2.156681776046753, + "sampling/importance_sampling_ratio/mean": 0.9560775279998779, + "sampling/importance_sampling_ratio/min": 0.20165933668613434, + "sampling/sampling_logp_difference/max": 1.7227527379989624, + "sampling/sampling_logp_difference/mean": 0.12174170911312103, + "step": 65, + "step_time": 2.1264499234013785 + }, + { + "clip_ratio/high_max": 0.01666666679084301, + "clip_ratio/high_mean": 0.008333333395421505, + "clip_ratio/low_mean": 0.009027777798473835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.017361111007630825, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.2, + "completions/max_terminated_length": 357.2, + "completions/mean_length": 284.3625, + "completions/mean_terminated_length": 284.3625, + "completions/min_length": 201.6, + "completions/min_terminated_length": 201.6, + "entropy": 0.3628254383802414, + "epoch": 0.0056, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.21124207973480225, + "kl": 0.6694609820842743, + "learning_rate": 9.950624834452351e-06, + "loss": 0.0007115081883966923, + "num_tokens": 939127.0, + "reward": 0.7271250009536743, + "reward_std": 0.28690858632326127, + "rewards/env_game_reward/mean": 0.7271250009536743, + "rewards/env_game_reward/std": 0.3982576608657837, + "sampling/importance_sampling_ratio/max": 2.1042640209198, + "sampling/importance_sampling_ratio/mean": 1.001141333580017, + "sampling/importance_sampling_ratio/min": 0.19583375304937362, + "sampling/sampling_logp_difference/max": 1.3206945657730103, + "sampling/sampling_logp_difference/mean": 0.11073778569698334, + "step": 70, + "step_time": 1.9332619735985646 + }, + { + "clip_ratio/high_max": 0.00625, + "clip_ratio/high_mean": 0.003125, + "clip_ratio/low_mean": 0.011388305388391019, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014513305388391018, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.4, + "completions/max_terminated_length": 365.4, + "completions/mean_length": 276.2, + "completions/mean_terminated_length": 276.2, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.26720436066389086, + "epoch": 0.006, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.09686678647994995, + "kl": 0.8629033237695694, + "learning_rate": 9.950620046026782e-06, + "loss": -0.0006346371024847031, + "num_tokens": 1004385.0, + "reward": 0.8097500324249267, + "reward_std": 0.25491200387477875, + "rewards/env_game_reward/mean": 0.8097500324249267, + "rewards/env_game_reward/std": 0.37631983757019044, + "sampling/importance_sampling_ratio/max": 1.5446483612060546, + "sampling/importance_sampling_ratio/mean": 0.8950366854667664, + "sampling/importance_sampling_ratio/min": 0.13836232647299768, + "sampling/sampling_logp_difference/max": 1.9488806009292603, + "sampling/sampling_logp_difference/mean": 0.12367857545614243, + "step": 75, + "step_time": 1.9649908402003349 + }, + { + "epoch": 0.006, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 360.3333333333333, + "eval_completions/max_terminated_length": 360.3333333333333, + "eval_completions/mean_length": 318.0416666666667, + "eval_completions/mean_terminated_length": 318.0416666666667, + "eval_completions/min_length": 249.33333333333334, + "eval_completions/min_terminated_length": 249.33333333333334, + "eval_entropy": 0.24031941095987955, + "eval_frac_reward_zero_std": 0.3333333333333333, + "eval_kl": 0.7372705638408661, + "eval_loss": -0.0011363811790943146, + "eval_num_tokens": 1004385.0, + "eval_reward": 0.8250000476837158, + "eval_reward_std": 0.3181980550289154, + "eval_rewards/env_game_reward/mean": 0.8250000476837158, + "eval_rewards/env_game_reward/std": 0.3558244506518046, + "eval_runtime": 2.1212, + "eval_samples_per_second": 4.714, + "eval_sampling/importance_sampling_ratio/max": 1.3695820967356365, + "eval_sampling/importance_sampling_ratio/mean": 0.9882725675900778, + "eval_sampling/importance_sampling_ratio/min": 0.5392542531092962, + "eval_sampling/sampling_logp_difference/max": 0.9286821683247884, + "eval_sampling/sampling_logp_difference/mean": 0.05970543374617895, + "eval_steps_per_second": 0.943, + "step": 75 + }, + { + "clip_ratio/high_max": 0.01702786386013031, + "clip_ratio/high_mean": 0.008513931930065156, + "clip_ratio/low_mean": 0.013888888992369175, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.022402821108698845, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.2, + "completions/max_terminated_length": 374.2, + "completions/mean_length": 304.9, + "completions/mean_terminated_length": 304.9, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.1878312572836876, + "epoch": 0.0064, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.06498023867607117, + "kl": 0.9341357827186585, + "learning_rate": 9.950614601654993e-06, + "loss": -0.001060234196484089, + "num_tokens": 1074730.0, + "reward": 0.8025000095367432, + "reward_std": 0.3075914442539215, + "rewards/env_game_reward/mean": 0.8025000095367432, + "rewards/env_game_reward/std": 0.3941896140575409, + "sampling/importance_sampling_ratio/max": 1.4447970628738402, + "sampling/importance_sampling_ratio/mean": 0.8254009127616883, + "sampling/importance_sampling_ratio/min": 0.03923565149307251, + "sampling/sampling_logp_difference/max": 2.3992530345916747, + "sampling/sampling_logp_difference/mean": 0.11771672368049621, + "step": 80, + "step_time": 1.982058087000769 + }, + { + "clip_ratio/high_max": 0.00625, + "clip_ratio/high_mean": 0.003125, + "clip_ratio/low_mean": 0.011638931930065155, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014763931930065154, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.6, + "completions/max_terminated_length": 374.6, + "completions/mean_length": 306.2875, + "completions/mean_terminated_length": 306.2875, + "completions/min_length": 219.2, + "completions/min_terminated_length": 219.2, + "entropy": 0.16379451900720596, + "epoch": 0.0068, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.05989319086074829, + "kl": 1.268787795305252, + "learning_rate": 9.950608501337942e-06, + "loss": -0.0012372495606541634, + "num_tokens": 1144439.0, + "reward": 0.8847500324249268, + "reward_std": 0.29733840227127073, + "rewards/env_game_reward/mean": 0.8847500324249268, + "rewards/env_game_reward/std": 0.33870106339454653, + "sampling/importance_sampling_ratio/max": 1.8493404388427734, + "sampling/importance_sampling_ratio/mean": 0.9314864993095398, + "sampling/importance_sampling_ratio/min": 0.13991991989314556, + "sampling/sampling_logp_difference/max": 1.9317576885223389, + "sampling/sampling_logp_difference/mean": 0.10915455222129822, + "step": 85, + "step_time": 2.131227758199384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002777777798473835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002777777798473835, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.2, + "completions/max_terminated_length": 374.2, + "completions/mean_length": 289.9125, + "completions/mean_terminated_length": 289.9125, + "completions/min_length": 227.2, + "completions/min_terminated_length": 227.2, + "entropy": 0.12536104992032052, + "epoch": 0.0072, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.20048366487026215, + "kl": 1.6958665192127227, + "learning_rate": 9.9506017450767e-06, + "loss": -0.0010345380753278733, + "num_tokens": 1211601.0, + "reward": 0.8850000381469727, + "reward_std": 0.3181980520486832, + "rewards/env_game_reward/mean": 0.8850000381469727, + "rewards/env_game_reward/std": 0.39857959747314453, + "sampling/importance_sampling_ratio/max": 1.9469106435775756, + "sampling/importance_sampling_ratio/mean": 0.9089800357818604, + "sampling/importance_sampling_ratio/min": 0.10673718531616032, + "sampling/sampling_logp_difference/max": 3.3863435983657837, + "sampling/sampling_logp_difference/mean": 0.1388186126947403, + "step": 90, + "step_time": 1.9693921780009986 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.019384890049695968, + "clip_ratio/low_min": 0.005000000074505806, + "clip_ratio/region_mean": 0.019384890049695968, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 302.6125, + "completions/mean_terminated_length": 302.6125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.1978470079600811, + "epoch": 0.0076, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.2384689301252365, + "kl": 1.4977002620697022, + "learning_rate": 9.950594332872455e-06, + "loss": -0.0007448500022292137, + "num_tokens": 1280837.0, + "reward": 0.7945000290870666, + "reward_std": 0.4037579774856567, + "rewards/env_game_reward/mean": 0.7945000290870666, + "rewards/env_game_reward/std": 0.4634205162525177, + "sampling/importance_sampling_ratio/max": 1.9967133045196532, + "sampling/importance_sampling_ratio/mean": 0.9287837624549866, + "sampling/importance_sampling_ratio/min": 0.0468483492732048, + "sampling/sampling_logp_difference/max": 2.3285855531692503, + "sampling/sampling_logp_difference/mean": 0.13745660185813904, + "step": 95, + "step_time": 1.966521529597958 + }, + { + "clip_ratio/high_max": 0.005882352963089943, + "clip_ratio/high_mean": 0.0029411764815449716, + "clip_ratio/low_mean": 0.020629085041582583, + "clip_ratio/low_min": 0.00555555559694767, + "clip_ratio/region_mean": 0.023570261523127557, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 294.075, + "completions/mean_terminated_length": 294.075, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.17654007077217101, + "epoch": 0.008, + "frac_reward_zero_std": 0.425, + "grad_norm": 0.13096670806407928, + "kl": 1.3037856757640838, + "learning_rate": 9.950586264726511e-06, + "loss": -0.0005890541709959507, + "num_tokens": 1348490.0, + "reward": 0.8925000071525574, + "reward_std": 0.2863782405853271, + "rewards/env_game_reward/mean": 0.8925000071525574, + "rewards/env_game_reward/std": 0.37230696678161623, + "sampling/importance_sampling_ratio/max": 1.8284294366836549, + "sampling/importance_sampling_ratio/mean": 0.8878485441207886, + "sampling/importance_sampling_ratio/min": 0.049935894832015035, + "sampling/sampling_logp_difference/max": 2.5030375480651856, + "sampling/sampling_logp_difference/mean": 0.14053128361701966, + "step": 100, + "step_time": 1.9532180378009798 + }, + { + "clip_ratio/high_max": 0.005882352963089943, + "clip_ratio/high_mean": 0.0029411764815449716, + "clip_ratio/low_mean": 0.014454334415495396, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01739551089704037, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.2, + "completions/max_terminated_length": 373.2, + "completions/mean_length": 282.025, + "completions/mean_terminated_length": 282.025, + "completions/min_length": 206.8, + "completions/min_terminated_length": 206.8, + "entropy": 0.1877336472272873, + "epoch": 0.0084, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.13922861218452454, + "kl": 1.016879215836525, + "learning_rate": 9.950577540640286e-06, + "loss": -0.0011088012717664242, + "num_tokens": 1414612.0, + "reward": 0.8173750281333924, + "reward_std": 0.24412862062454224, + "rewards/env_game_reward/mean": 0.8173750281333924, + "rewards/env_game_reward/std": 0.39675052762031554, + "sampling/importance_sampling_ratio/max": 2.0766624450683593, + "sampling/importance_sampling_ratio/mean": 0.9333486676216125, + "sampling/importance_sampling_ratio/min": 0.17753537595272065, + "sampling/sampling_logp_difference/max": 1.8823954582214355, + "sampling/sampling_logp_difference/mean": 0.13100463151931763, + "step": 105, + "step_time": 2.1976747798005816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.01445433460175991, + "clip_ratio/low_min": 0.005263157933950424, + "clip_ratio/region_mean": 0.01445433460175991, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.8, + "completions/max_terminated_length": 365.8, + "completions/mean_length": 284.425, + "completions/mean_terminated_length": 284.425, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.1360536314547062, + "epoch": 0.0088, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.03397221490740776, + "kl": 1.4786522924900054, + "learning_rate": 9.950568160615312e-06, + "loss": -0.0008132285438477993, + "num_tokens": 1481189.0, + "reward": 0.9673750162124634, + "reward_std": 0.22291541397571563, + "rewards/env_game_reward/mean": 0.9673750162124634, + "rewards/env_game_reward/std": 0.3230068266391754, + "sampling/importance_sampling_ratio/max": 2.0664987325668336, + "sampling/importance_sampling_ratio/mean": 0.9952835202217102, + "sampling/importance_sampling_ratio/min": 0.21388899087905883, + "sampling/sampling_logp_difference/max": 1.6557833194732665, + "sampling/sampling_logp_difference/mean": 0.10078478828072548, + "step": 110, + "step_time": 1.9459437828008959 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.008099906705319881, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008099906705319881, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 299.8625, + "completions/mean_terminated_length": 299.8625, + "completions/min_length": 249.6, + "completions/min_terminated_length": 249.6, + "entropy": 0.0922572823241353, + "epoch": 0.0092, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.02898675948381424, + "kl": 1.3785052359104157, + "learning_rate": 9.950558124653239e-06, + "loss": -0.0016875043511390686, + "num_tokens": 1549583.0, + "reward": 0.9673750400543213, + "reward_std": 0.20170221328735352, + "rewards/env_game_reward/mean": 0.9673750400543213, + "rewards/env_game_reward/std": 0.31243913173675536, + "sampling/importance_sampling_ratio/max": 1.452053427696228, + "sampling/importance_sampling_ratio/mean": 0.8291665196418763, + "sampling/importance_sampling_ratio/min": 0.06903318837285041, + "sampling/sampling_logp_difference/max": 2.5763232707977295, + "sampling/sampling_logp_difference/mean": 0.12499624639749526, + "step": 115, + "step_time": 2.0123212985985446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.011785130575299263, + "clip_ratio/low_min": 0.005882352963089943, + "clip_ratio/region_mean": 0.011785130575299263, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 288.425, + "completions/mean_terminated_length": 288.425, + "completions/min_length": 227.4, + "completions/min_terminated_length": 227.4, + "entropy": 0.07345340847969055, + "epoch": 0.0096, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.06546451151371002, + "kl": 1.1702515691518784, + "learning_rate": 9.950547432755832e-06, + "loss": -0.0007662988267838955, + "num_tokens": 1616803.0, + "reward": 1.0573750376701354, + "reward_std": 0.18013544976711274, + "rewards/env_game_reward/mean": 1.0573750376701354, + "rewards/env_game_reward/std": 0.2634139180183411, + "sampling/importance_sampling_ratio/max": 1.988950991630554, + "sampling/importance_sampling_ratio/mean": 0.9787555813789368, + "sampling/importance_sampling_ratio/min": 0.09628211334347725, + "sampling/sampling_logp_difference/max": 2.1504470825195314, + "sampling/sampling_logp_difference/mean": 0.08526455238461494, + "step": 120, + "step_time": 1.9904663407985936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.015257352963089943, + "clip_ratio/low_min": 0.00625, + "clip_ratio/region_mean": 0.015257352963089943, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.2, + "completions/max_terminated_length": 349.2, + "completions/mean_length": 267.2875, + "completions/mean_terminated_length": 267.2875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.07882922235876322, + "epoch": 0.01, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.45003291964530945, + "kl": 4.5989465653896335, + "learning_rate": 9.950536084924971e-06, + "loss": -0.0006290416233241559, + "num_tokens": 1679963.0, + "reward": 1.0725000381469727, + "reward_std": 0.1590990275144577, + "rewards/env_game_reward/mean": 1.0725000381469727, + "rewards/env_game_reward/std": 0.24540797472000123, + "sampling/importance_sampling_ratio/max": 1.6149428606033325, + "sampling/importance_sampling_ratio/mean": 0.9134042382240295, + "sampling/importance_sampling_ratio/min": 0.06451723147183656, + "sampling/sampling_logp_difference/max": 2.553125739097595, + "sampling/sampling_logp_difference/mean": 0.11119775697588921, + "step": 125, + "step_time": 1.9727394509995064 + }, + { + "clip_ratio/high_max": 0.00625, + "clip_ratio/high_mean": 0.003125, + "clip_ratio/low_mean": 0.002777777798473835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005902777798473835, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.6, + "completions/max_terminated_length": 373.6, + "completions/mean_length": 279.75, + "completions/mean_terminated_length": 279.75, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "entropy": 0.11642275676131249, + "epoch": 0.0104, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.03156660497188568, + "kl": 0.8787374496459961, + "learning_rate": 9.950524081162648e-06, + "loss": -0.00015998464077711106, + "num_tokens": 1745793.0, + "reward": 1.0725000381469727, + "reward_std": 0.09545941650867462, + "rewards/env_game_reward/mean": 1.0725000381469727, + "rewards/env_game_reward/std": 0.27027024030685426, + "sampling/importance_sampling_ratio/max": 1.5973956346511842, + "sampling/importance_sampling_ratio/mean": 1.051470685005188, + "sampling/importance_sampling_ratio/min": 0.41294346153736117, + "sampling/sampling_logp_difference/max": 1.1177088975906373, + "sampling/sampling_logp_difference/mean": 0.04818851538002491, + "step": 130, + "step_time": 2.081850044800376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.011513157933950424, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.011513157933950424, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.8, + "completions/max_terminated_length": 365.8, + "completions/mean_length": 277.775, + "completions/mean_terminated_length": 277.775, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.16472503766417504, + "epoch": 0.0108, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.04943538084626198, + "kl": 0.9097718864679336, + "learning_rate": 9.950511421470975e-06, + "loss": -0.0013091729953885079, + "num_tokens": 1811214.0, + "reward": 0.937375009059906, + "reward_std": 0.3077682375907898, + "rewards/env_game_reward/mean": 0.937375009059906, + "rewards/env_game_reward/std": 0.3747471272945404, + "sampling/importance_sampling_ratio/max": 1.3492722034454345, + "sampling/importance_sampling_ratio/mean": 0.8988288879394531, + "sampling/importance_sampling_ratio/min": 0.22225932404398918, + "sampling/sampling_logp_difference/max": 1.6521860837936402, + "sampling/sampling_logp_difference/mean": 0.08873736709356309, + "step": 135, + "step_time": 1.9900708778011904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.01472630724310875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.01472630724310875, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.8, + "completions/max_terminated_length": 373.8, + "completions/mean_length": 284.375, + "completions/mean_terminated_length": 284.375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.1683448150753975, + "epoch": 0.0112, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.055281057953834534, + "kl": 0.8311902970075608, + "learning_rate": 9.950498105852176e-06, + "loss": -0.0005867025814950466, + "num_tokens": 1877869.0, + "reward": 0.9825000524520874, + "reward_std": 0.2863782525062561, + "rewards/env_game_reward/mean": 0.9825000524520874, + "rewards/env_game_reward/std": 0.3745770752429962, + "sampling/importance_sampling_ratio/max": 1.7486498355865479, + "sampling/importance_sampling_ratio/mean": 1.0248434662818908, + "sampling/importance_sampling_ratio/min": 0.2848562225699425, + "sampling/sampling_logp_difference/max": 1.237773633003235, + "sampling/sampling_logp_difference/mean": 0.06331658810377121, + "step": 140, + "step_time": 1.9763025989996095 + }, + { + "clip_ratio/high_max": 0.005882352963089943, + "clip_ratio/high_mean": 0.0029411764815449716, + "clip_ratio/low_mean": 0.005718954280018807, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008660130761563778, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.6, + "completions/max_terminated_length": 373.6, + "completions/mean_length": 301.7375, + "completions/mean_terminated_length": 301.7375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.16710587963461876, + "epoch": 0.0116, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.04859064891934395, + "kl": 0.8752860724925995, + "learning_rate": 9.950484134308594e-06, + "loss": -0.0005891127046197652, + "num_tokens": 1947235.0, + "reward": 0.9448750495910645, + "reward_std": 0.2335220217704773, + "rewards/env_game_reward/mean": 0.9448750495910645, + "rewards/env_game_reward/std": 0.3758414626121521, + "sampling/importance_sampling_ratio/max": 1.7692456722259522, + "sampling/importance_sampling_ratio/mean": 0.964184021949768, + "sampling/importance_sampling_ratio/min": 0.2673730432987213, + "sampling/sampling_logp_difference/max": 1.0289928674697877, + "sampling/sampling_logp_difference/mean": 0.07320068627595902, + "step": 145, + "step_time": 2.0029842743999327 + }, + { + "clip_ratio/high_max": 0.005882352963089943, + "clip_ratio/high_mean": 0.0029411764815449716, + "clip_ratio/low_mean": 0.01180555559694767, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.014746732078492641, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 281.6375, + "completions/mean_terminated_length": 281.6375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.11515891700983047, + "epoch": 0.012, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.029918797314167023, + "kl": 2.9318640172481536, + "learning_rate": 9.950469506842683e-06, + "loss": -0.0018826086074113847, + "num_tokens": 2012897.0, + "reward": 0.9900000214576721, + "reward_std": 0.23334523737430574, + "rewards/env_game_reward/mean": 0.9900000214576721, + "rewards/env_game_reward/std": 0.333386093378067, + "sampling/importance_sampling_ratio/max": 1.7696414470672608, + "sampling/importance_sampling_ratio/mean": 0.9344987154006958, + "sampling/importance_sampling_ratio/min": 0.15813518241047858, + "sampling/sampling_logp_difference/max": 1.9756666660308837, + "sampling/sampling_logp_difference/mean": 0.10830660909414291, + "step": 150, + "step_time": 2.127201033801248 + }, + { + "epoch": 0.012, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 360.3333333333333, + "eval_completions/max_terminated_length": 360.3333333333333, + "eval_completions/mean_length": 317.9583333333333, + "eval_completions/mean_terminated_length": 317.9583333333333, + "eval_completions/min_length": 249.33333333333334, + "eval_completions/min_terminated_length": 249.33333333333334, + "eval_entropy": 0.13987144331137338, + "eval_frac_reward_zero_std": 0.5833333333333334, + "eval_kl": 0.781950831413269, + "eval_loss": -0.0002963352599181235, + "eval_num_tokens": 2012897.0, + "eval_reward": 1.00000003973643, + "eval_reward_std": 0.2121320366859436, + "eval_rewards/env_game_reward/mean": 1.00000003973643, + "eval_rewards/env_game_reward/std": 0.34489662448565167, + "eval_runtime": 2.1427, + "eval_samples_per_second": 4.667, + "eval_sampling/importance_sampling_ratio/max": 1.6415756543477376, + "eval_sampling/importance_sampling_ratio/mean": 0.9771247307459513, + "eval_sampling/importance_sampling_ratio/min": 0.4607134858767192, + "eval_sampling/sampling_logp_difference/max": 1.1493730743726094, + "eval_sampling/sampling_logp_difference/mean": 0.07872161269187927, + "eval_steps_per_second": 0.933, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0029411764815449716, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029411764815449716, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.2, + "completions/max_terminated_length": 365.2, + "completions/mean_length": 280.625, + "completions/mean_terminated_length": 280.625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.13695140182971954, + "epoch": 0.0124, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.02835434302687645, + "kl": 0.8980278909206391, + "learning_rate": 9.950454223457017e-06, + "loss": -0.0009985696524381638, + "num_tokens": 2078097.0, + "reward": 1.0575000286102294, + "reward_std": 0.13788582384586334, + "rewards/env_game_reward/mean": 1.0575000286102294, + "rewards/env_game_reward/std": 0.2769304394721985, + "sampling/importance_sampling_ratio/max": 1.9645647525787353, + "sampling/importance_sampling_ratio/mean": 1.053748869895935, + "sampling/importance_sampling_ratio/min": 0.1966949909925461, + "sampling/sampling_logp_difference/max": 1.4146570682525634, + "sampling/sampling_logp_difference/mean": 0.0750476747751236, + "step": 155, + "step_time": 1.9448262721984064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0029411764815449716, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029411764815449716, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.6, + "completions/max_terminated_length": 373.6, + "completions/mean_length": 287.6125, + "completions/mean_terminated_length": 287.6125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.09548097178339958, + "epoch": 0.0128, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.016875844448804855, + "kl": 0.9205106794834137, + "learning_rate": 9.95043828415428e-06, + "loss": -0.0003811331000179052, + "num_tokens": 2145572.0, + "reward": 1.1175000429153443, + "reward_std": 0.09545941650867462, + "rewards/env_game_reward/mean": 1.1175000429153443, + "rewards/env_game_reward/std": 0.21992212533950806, + "sampling/importance_sampling_ratio/max": 1.873227596282959, + "sampling/importance_sampling_ratio/mean": 1.035505485534668, + "sampling/importance_sampling_ratio/min": 0.2760059699416161, + "sampling/sampling_logp_difference/max": 1.6244861602783203, + "sampling/sampling_logp_difference/mean": 0.055689787119627, + "step": 160, + "step_time": 1.9473863302002428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.008204334415495396, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008204334415495396, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 293.95, + "completions/mean_terminated_length": 293.95, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "entropy": 0.09301564693450928, + "epoch": 0.0132, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.03346140682697296, + "kl": 1.1773792251944541, + "learning_rate": 9.950421688937273e-06, + "loss": -0.0007360159419476986, + "num_tokens": 2213963.0, + "reward": 1.0425000429153441, + "reward_std": 0.20152542889118194, + "rewards/env_game_reward/mean": 1.0425000429153441, + "rewards/env_game_reward/std": 0.3081572115421295, + "sampling/importance_sampling_ratio/max": 1.4281560897827148, + "sampling/importance_sampling_ratio/mean": 0.9599952220916748, + "sampling/importance_sampling_ratio/min": 0.2669469267129898, + "sampling/sampling_logp_difference/max": 1.390862488746643, + "sampling/sampling_logp_difference/mean": 0.055699700862169264, + "step": 165, + "step_time": 1.9470213165986934 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.009027777798473835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009027777798473835, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.6, + "completions/max_terminated_length": 373.6, + "completions/mean_length": 284.475, + "completions/mean_terminated_length": 284.475, + "completions/min_length": 218.8, + "completions/min_terminated_length": 218.8, + "entropy": 0.07658235654234886, + "epoch": 0.0136, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.03937648609280586, + "kl": 1.81198593378067, + "learning_rate": 9.950404437808918e-06, + "loss": -0.0011821023188531398, + "num_tokens": 2279225.0, + "reward": 1.0500000238418579, + "reward_std": 0.19091882705688476, + "rewards/env_game_reward/mean": 1.0500000238418579, + "rewards/env_game_reward/std": 0.3320326149463654, + "sampling/importance_sampling_ratio/max": 1.477155590057373, + "sampling/importance_sampling_ratio/mean": 0.9458814024925232, + "sampling/importance_sampling_ratio/min": 0.270341220498085, + "sampling/sampling_logp_difference/max": 1.6849958896636963, + "sampling/sampling_logp_difference/mean": 0.08588936105370522, + "step": 170, + "step_time": 2.1512292531995625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.4, + "completions/max_terminated_length": 365.4, + "completions/mean_length": 303.2, + "completions/mean_terminated_length": 303.2, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.09666889570653439, + "epoch": 0.014, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.04141394793987274, + "kl": 2.4968443870544434, + "learning_rate": 9.950386530772241e-06, + "loss": -0.00040261512622237207, + "num_tokens": 2348233.0, + "reward": 1.1023750305175781, + "reward_std": 0.11684939563274384, + "rewards/env_game_reward/mean": 1.1023750305175781, + "rewards/env_game_reward/std": 0.21312530040740968, + "sampling/importance_sampling_ratio/max": 1.6463705539703368, + "sampling/importance_sampling_ratio/mean": 1.0285760641098023, + "sampling/importance_sampling_ratio/min": 0.4818965196609497, + "sampling/sampling_logp_difference/max": 0.8949426889419556, + "sampling/sampling_logp_difference/mean": 0.04998132698237896, + "step": 175, + "step_time": 2.019259786400653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.2, + "completions/max_terminated_length": 366.2, + "completions/mean_length": 292.625, + "completions/mean_terminated_length": 292.625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.11000263169407845, + "epoch": 0.0144, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.014517554081976414, + "kl": 3.3274164140224456, + "learning_rate": 9.950367967830395e-06, + "loss": -0.0005131676327437163, + "num_tokens": 2415877.0, + "reward": 1.0423750162124634, + "reward_std": 0.22291540503501892, + "rewards/env_game_reward/mean": 1.0423750162124634, + "rewards/env_game_reward/std": 0.3507106065750122, + "sampling/importance_sampling_ratio/max": 1.5298500537872315, + "sampling/importance_sampling_ratio/mean": 0.9435723781585693, + "sampling/importance_sampling_ratio/min": 0.327379421889782, + "sampling/sampling_logp_difference/max": 1.2668209314346313, + "sampling/sampling_logp_difference/mean": 0.06219653338193894, + "step": 180, + "step_time": 1.9583896137977717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002631578966975212, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002631578966975212, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.2, + "completions/max_terminated_length": 374.2, + "completions/mean_length": 281.1875, + "completions/mean_terminated_length": 281.1875, + "completions/min_length": 199.6, + "completions/min_terminated_length": 199.6, + "entropy": 0.12678153738379477, + "epoch": 0.0148, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.016833849251270294, + "kl": 1.2115282833576202, + "learning_rate": 9.950348748986642e-06, + "loss": -0.000427194032818079, + "num_tokens": 2481693.0, + "reward": 1.0798749923706055, + "reward_std": 0.16988240480422973, + "rewards/env_game_reward/mean": 1.0798749923706055, + "rewards/env_game_reward/std": 0.2858531653881073, + "sampling/importance_sampling_ratio/max": 1.3460319757461547, + "sampling/importance_sampling_ratio/mean": 0.9750611186027527, + "sampling/importance_sampling_ratio/min": 0.32110539078712463, + "sampling/sampling_logp_difference/max": 1.2185750484466553, + "sampling/sampling_logp_difference/mean": 0.04203767627477646, + "step": 185, + "step_time": 2.0135145276013646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.009007352963089943, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.009007352963089943, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.2, + "completions/max_terminated_length": 373.2, + "completions/mean_length": 284.75, + "completions/mean_terminated_length": 284.75, + "completions/min_length": 219.2, + "completions/min_terminated_length": 219.2, + "entropy": 0.17685981318354607, + "epoch": 0.0152, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.02496095560491085, + "kl": 1.321517664194107, + "learning_rate": 9.950328874244359e-06, + "loss": -0.0008681080304086209, + "num_tokens": 2548402.0, + "reward": 1.0200000524520874, + "reward_std": 0.23334523439407348, + "rewards/env_game_reward/mean": 1.0200000524520874, + "rewards/env_game_reward/std": 0.350832587480545, + "sampling/importance_sampling_ratio/max": 1.5548951625823975, + "sampling/importance_sampling_ratio/mean": 0.9757178544998169, + "sampling/importance_sampling_ratio/min": 0.40361145734786985, + "sampling/sampling_logp_difference/max": 0.9451999545097352, + "sampling/sampling_logp_difference/mean": 0.04799589850008488, + "step": 190, + "step_time": 2.1438089131981544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002777777798473835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002777777798473835, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.6, + "completions/max_terminated_length": 357.6, + "completions/mean_length": 278.6, + "completions/mean_terminated_length": 278.6, + "completions/min_length": 206.8, + "completions/min_terminated_length": 206.8, + "entropy": 0.16943402960896492, + "epoch": 0.0156, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.04945311322808266, + "kl": 0.8556254506111145, + "learning_rate": 9.950308343607042e-06, + "loss": -0.00024995713029056786, + "num_tokens": 2614104.0, + "reward": 1.124875020980835, + "reward_std": 0.1062427967786789, + "rewards/env_game_reward/mean": 1.124875020980835, + "rewards/env_game_reward/std": 0.2150747537612915, + "sampling/importance_sampling_ratio/max": 1.4159629583358764, + "sampling/importance_sampling_ratio/mean": 1.027950894832611, + "sampling/importance_sampling_ratio/min": 0.4427237957715988, + "sampling/sampling_logp_difference/max": 1.0141066074371339, + "sampling/sampling_logp_difference/mean": 0.034100130572915076, + "step": 195, + "step_time": 1.9601369298005011 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0029411764815449716, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029411764815449716, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.8, + "completions/max_terminated_length": 373.8, + "completions/mean_length": 292.3, + "completions/mean_terminated_length": 292.3, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.12501999139785766, + "epoch": 0.016, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.035976510494947433, + "kl": 1.643832701444626, + "learning_rate": 9.950287157078299e-06, + "loss": -3.578556934371591e-05, + "num_tokens": 2681768.0, + "reward": 1.1175000429153443, + "reward_std": 0.09545941650867462, + "rewards/env_game_reward/mean": 1.1175000429153443, + "rewards/env_game_reward/std": 0.1825194239616394, + "sampling/importance_sampling_ratio/max": 1.2619178771972657, + "sampling/importance_sampling_ratio/mean": 0.9969248056411744, + "sampling/importance_sampling_ratio/min": 0.6032956957817077, + "sampling/sampling_logp_difference/max": 0.722657385468483, + "sampling/sampling_logp_difference/mean": 0.025445305183529852, + "step": 200, + "step_time": 2.0157902724015004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.003125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.8, + "completions/max_terminated_length": 365.8, + "completions/mean_length": 287.55, + "completions/mean_terminated_length": 287.55, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.10517201647162437, + "epoch": 0.0164, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.645147318020463e-05, + "kl": 0.9572436332702636, + "learning_rate": 9.950265314661852e-06, + "loss": -0.0006879229098558426, + "num_tokens": 2748656.0, + "reward": 1.0875000596046447, + "reward_std": 0.11667262017726898, + "rewards/env_game_reward/mean": 1.0875000596046447, + "rewards/env_game_reward/std": 0.22443267107009887, + "sampling/importance_sampling_ratio/max": 1.411534857749939, + "sampling/importance_sampling_ratio/mean": 0.97565096616745, + "sampling/importance_sampling_ratio/min": 0.28964664936065676, + "sampling/sampling_logp_difference/max": 0.8600339412689209, + "sampling/sampling_logp_difference/mean": 0.03780459091067314, + "step": 205, + "step_time": 1.9924305702013954 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002777777798473835, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002777777798473835, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.6, + "completions/max_terminated_length": 365.6, + "completions/mean_length": 289.425, + "completions/mean_terminated_length": 289.425, + "completions/min_length": 225.8, + "completions/min_terminated_length": 225.8, + "entropy": 0.06622385941445827, + "epoch": 0.0168, + "frac_reward_zero_std": 0.9, + "grad_norm": 3.81359423045069e-05, + "kl": 1.2422339260578155, + "learning_rate": 9.950242816361542e-06, + "loss": -4.801744944415987e-06, + "num_tokens": 2816147.0, + "reward": 1.1625000476837157, + "reward_std": 0.0530330091714859, + "rewards/env_game_reward/mean": 1.1625000476837157, + "rewards/env_game_reward/std": 0.12526867985725404, + "sampling/importance_sampling_ratio/max": 1.1708146333694458, + "sampling/importance_sampling_ratio/mean": 1.0134225606918335, + "sampling/importance_sampling_ratio/min": 0.794332218170166, + "sampling/sampling_logp_difference/max": 0.3431601271033287, + "sampling/sampling_logp_difference/mean": 0.01179146794602275, + "step": 210, + "step_time": 2.072417707799468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002631578966975212, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002631578966975212, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.6, + "completions/max_terminated_length": 357.6, + "completions/mean_length": 279.8375, + "completions/mean_terminated_length": 279.8375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.09221492633223534, + "epoch": 0.0172, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.019924040883779526, + "kl": 6.042136391997337, + "learning_rate": 9.950219662181327e-06, + "loss": 0.0002026589121669531, + "num_tokens": 2882162.0, + "reward": 1.0723750352859498, + "reward_std": 0.13806260228157044, + "rewards/env_game_reward/mean": 1.0723750352859498, + "rewards/env_game_reward/std": 0.2875085473060608, + "sampling/importance_sampling_ratio/max": 1.1406650304794312, + "sampling/importance_sampling_ratio/mean": 0.9631157875061035, + "sampling/importance_sampling_ratio/min": 0.5338585242629051, + "sampling/sampling_logp_difference/max": 0.9422056198120117, + "sampling/sampling_logp_difference/mean": 0.038619527220726015, + "step": 215, + "step_time": 2.0965528106011333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.008333333395421505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008333333395421505, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.6, + "completions/max_terminated_length": 373.6, + "completions/mean_length": 296.625, + "completions/mean_terminated_length": 296.625, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.209561850130558, + "epoch": 0.0176, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.025828201323747635, + "kl": 1.7181314051151275, + "learning_rate": 9.950195852125273e-06, + "loss": -0.00037424759939312934, + "num_tokens": 2950140.0, + "reward": 1.0123750209808349, + "reward_std": 0.20170221328735352, + "rewards/env_game_reward/mean": 1.0123750209808349, + "rewards/env_game_reward/std": 0.35673150420188904, + "sampling/importance_sampling_ratio/max": 1.5680360555648805, + "sampling/importance_sampling_ratio/mean": 0.978592324256897, + "sampling/importance_sampling_ratio/min": 0.29607744626700877, + "sampling/sampling_logp_difference/max": 1.106997287273407, + "sampling/sampling_logp_difference/mean": 0.05600374937057495, + "step": 220, + "step_time": 2.013816408000275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.8, + "completions/max_terminated_length": 365.8, + "completions/mean_length": 283.4125, + "completions/mean_terminated_length": 283.4125, + "completions/min_length": 219.2, + "completions/min_terminated_length": 219.2, + "entropy": 0.19863322675228118, + "epoch": 0.018, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.0762995183467865, + "kl": 1.3156237244606017, + "learning_rate": 9.95017138619757e-06, + "loss": 9.536017896607518e-05, + "num_tokens": 3015791.0, + "reward": 1.0125000596046447, + "reward_std": 0.22273863554000856, + "rewards/env_game_reward/mean": 1.0125000596046447, + "rewards/env_game_reward/std": 0.3335274219512939, + "sampling/importance_sampling_ratio/max": 1.5515583753585815, + "sampling/importance_sampling_ratio/mean": 1.00895494222641, + "sampling/importance_sampling_ratio/min": 0.5095189124345779, + "sampling/sampling_logp_difference/max": 0.8763931155204773, + "sampling/sampling_logp_difference/mean": 0.040667933598160746, + "step": 225, + "step_time": 2.018751011400309 + }, + { + "epoch": 0.018, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 360.0, + "eval_completions/max_terminated_length": 360.0, + "eval_completions/mean_length": 317.75, + "eval_completions/mean_terminated_length": 317.75, + "eval_completions/min_length": 249.33333333333334, + "eval_completions/min_terminated_length": 249.33333333333334, + "eval_entropy": 0.1765607347091039, + "eval_frac_reward_zero_std": 0.4166666666666667, + "eval_kl": 1.3525162537892659, + "eval_loss": -0.0003587036917451769, + "eval_num_tokens": 3015791.0, + "eval_reward": 0.9500000476837158, + "eval_reward_std": 0.3535533944765727, + "eval_rewards/env_game_reward/mean": 0.9500000476837158, + "eval_rewards/env_game_reward/std": 0.39218372106552124, + "eval_runtime": 2.0879, + "eval_samples_per_second": 4.789, + "eval_sampling/importance_sampling_ratio/max": 1.2104533513387044, + "eval_sampling/importance_sampling_ratio/mean": 0.9633649388949076, + "eval_sampling/importance_sampling_ratio/min": 0.3959500590960185, + "eval_sampling/sampling_logp_difference/max": 1.0513638655344646, + "eval_sampling/sampling_logp_difference/mean": 0.04657792175809542, + "eval_steps_per_second": 0.958, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002631578966975212, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002631578966975212, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.4, + "completions/max_terminated_length": 374.4, + "completions/mean_length": 303.125, + "completions/mean_terminated_length": 303.125, + "completions/min_length": 234.4, + "completions/min_terminated_length": 234.4, + "entropy": 0.1315464749932289, + "epoch": 0.0184, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.017125917598605156, + "kl": 1.0924064695835114, + "learning_rate": 9.950146264402513e-06, + "loss": -0.0007199865765869617, + "num_tokens": 3085648.0, + "reward": 1.0575000405311585, + "reward_std": 0.2015254318714142, + "rewards/env_game_reward/mean": 1.0575000405311585, + "rewards/env_game_reward/std": 0.2932907700538635, + "sampling/importance_sampling_ratio/max": 1.2461576223373414, + "sampling/importance_sampling_ratio/mean": 0.9759754657745361, + "sampling/importance_sampling_ratio/min": 0.4818070411682129, + "sampling/sampling_logp_difference/max": 0.7828710556030274, + "sampling/sampling_logp_difference/mean": 0.033535952866077426, + "step": 230, + "step_time": 1.964708850999159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.003125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003125, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.6, + "completions/max_terminated_length": 357.6, + "completions/mean_length": 277.425, + "completions/mean_terminated_length": 277.425, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.0799025647342205, + "epoch": 0.0188, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.10008827596902847, + "kl": 2.224274104833603, + "learning_rate": 9.950120486744523e-06, + "loss": 8.720792829990387e-05, + "num_tokens": 3151112.0, + "reward": 1.0950000286102295, + "reward_std": 0.1484924226999283, + "rewards/env_game_reward/mean": 1.0950000286102295, + "rewards/env_game_reward/std": 0.2522334337234497, + "sampling/importance_sampling_ratio/max": 1.1456147193908692, + "sampling/importance_sampling_ratio/mean": 0.9904715538024902, + "sampling/importance_sampling_ratio/min": 0.647669506072998, + "sampling/sampling_logp_difference/max": 0.4812516301870346, + "sampling/sampling_logp_difference/mean": 0.01942737139761448, + "step": 235, + "step_time": 2.084714898400125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.005882352963089943, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005882352963089943, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.4, + "completions/max_terminated_length": 373.4, + "completions/mean_length": 293.65, + "completions/mean_terminated_length": 293.65, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.06182103715837002, + "epoch": 0.0192, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.014402960427105427, + "kl": 1.220539104938507, + "learning_rate": 9.950094053228128e-06, + "loss": -0.000241970201022923, + "num_tokens": 3218802.0, + "reward": 1.140000057220459, + "reward_std": 0.08485281467437744, + "rewards/env_game_reward/mean": 1.140000057220459, + "rewards/env_game_reward/std": 0.16455072164535522, + "sampling/importance_sampling_ratio/max": 1.1483745574951172, + "sampling/importance_sampling_ratio/mean": 1.0016549229621887, + "sampling/importance_sampling_ratio/min": 0.740897786617279, + "sampling/sampling_logp_difference/max": 0.4138609737157822, + "sampling/sampling_logp_difference/mean": 0.012947980128228665, + "step": 240, + "step_time": 1.9686758019997797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.2, + "completions/max_terminated_length": 374.2, + "completions/mean_length": 295.5375, + "completions/mean_terminated_length": 295.5375, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.06896210089325905, + "epoch": 0.0196, + "frac_reward_zero_std": 0.85, + "grad_norm": 3.613123408285901e-05, + "kl": 1.0657689243555069, + "learning_rate": 9.950066963857978e-06, + "loss": 5.23922499269247e-06, + "num_tokens": 3286788.0, + "reward": 1.139875054359436, + "reward_std": 0.08502959161996841, + "rewards/env_game_reward/mean": 1.139875054359436, + "rewards/env_game_reward/std": 0.1967564880847931, + "sampling/importance_sampling_ratio/max": 1.1366309642791748, + "sampling/importance_sampling_ratio/mean": 1.0147639751434325, + "sampling/importance_sampling_ratio/min": 0.8659112334251404, + "sampling/sampling_logp_difference/max": 0.27902010679244993, + "sampling/sampling_logp_difference/mean": 0.011635956121608615, + "step": 245, + "step_time": 1.9748669714004792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.4, + "completions/max_terminated_length": 365.4, + "completions/mean_length": 289.75, + "completions/mean_terminated_length": 289.75, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.05834418162703514, + "epoch": 0.02, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.10833176970481873, + "kl": 1.9543183267116546, + "learning_rate": 9.950039218638832e-06, + "loss": 7.4031762778759e-05, + "num_tokens": 3353652.0, + "reward": 1.1325000286102296, + "reward_std": 0.09545941650867462, + "rewards/env_game_reward/mean": 1.1325000286102296, + "rewards/env_game_reward/std": 0.22625648975372314, + "sampling/importance_sampling_ratio/max": 1.149832510948181, + "sampling/importance_sampling_ratio/mean": 0.9904601097106933, + "sampling/importance_sampling_ratio/min": 0.6639677166938782, + "sampling/sampling_logp_difference/max": 0.6797393321990967, + "sampling/sampling_logp_difference/mean": 0.017544577736407517, + "step": 250, + "step_time": 1.967866663800669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.001923076994717121, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001923076994717121, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 447.3875, + "completions/mean_terminated_length": 447.3875, + "completions/min_length": 344.8, + "completions/min_terminated_length": 344.8, + "entropy": 0.05960344485938549, + "epoch": 0.0204, + "frac_reward_zero_std": 0.725, + "grad_norm": 0.007863137871026993, + "kl": 1.9300155520439148, + "learning_rate": 9.95001081757557e-06, + "loss": -0.0004777685273438692, + "num_tokens": 3433435.0, + "reward": 1.1585000038146973, + "reward_std": 0.1282220296561718, + "rewards/env_game_reward/mean": 1.1585000038146973, + "rewards/env_game_reward/std": 0.2555761218070984, + "sampling/importance_sampling_ratio/max": 1.133116865158081, + "sampling/importance_sampling_ratio/mean": 0.9558210611343384, + "sampling/importance_sampling_ratio/min": 0.4158107668161392, + "sampling/sampling_logp_difference/max": 1.0225807905197144, + "sampling/sampling_logp_difference/mean": 0.02713311556726694, + "step": 255, + "step_time": 2.663749648000521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0018518518656492234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018518518656492234, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.0, + "completions/max_terminated_length": 714.0, + "completions/mean_length": 577.2875, + "completions/mean_terminated_length": 577.2875, + "completions/min_length": 421.6, + "completions/min_terminated_length": 421.6, + "entropy": 0.10376667603850365, + "epoch": 0.0208, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.03462546318769455, + "kl": 1.5407811641693114, + "learning_rate": 9.94998176067318e-06, + "loss": 0.0002572691533714533, + "num_tokens": 3523855.0, + "reward": 1.0450416326522827, + "reward_std": 0.17671776488423346, + "rewards/env_game_reward/mean": 1.0450416326522827, + "rewards/env_game_reward/std": 0.45010764002799986, + "sampling/importance_sampling_ratio/max": 1.5040478706359863, + "sampling/importance_sampling_ratio/mean": 0.9873509287834168, + "sampling/importance_sampling_ratio/min": 0.370639518648386, + "sampling/sampling_logp_difference/max": 1.1807260036468505, + "sampling/sampling_logp_difference/mean": 0.04055641330778599, + "step": 260, + "step_time": 2.858658364200528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.001923076994717121, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001923076994717121, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.8, + "completions/max_terminated_length": 711.8, + "completions/mean_length": 536.9, + "completions/mean_terminated_length": 536.9, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.08738742098212242, + "epoch": 0.0212, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.020184623077511787, + "kl": 1.8179149568080901, + "learning_rate": 9.949952047936776e-06, + "loss": -0.00012561121257022024, + "num_tokens": 3610314.0, + "reward": 1.1425416231155396, + "reward_std": 0.16104355901479722, + "rewards/env_game_reward/mean": 1.1425416231155396, + "rewards/env_game_reward/std": 0.3060436934232712, + "sampling/importance_sampling_ratio/max": 1.5381874561309814, + "sampling/importance_sampling_ratio/mean": 0.9935122966766358, + "sampling/importance_sampling_ratio/min": 0.4440208673477173, + "sampling/sampling_logp_difference/max": 0.8807984113693237, + "sampling/sampling_logp_difference/mean": 0.026784875988960268, + "step": 265, + "step_time": 2.800866130801296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.6, + "completions/max_terminated_length": 714.6, + "completions/mean_length": 565.0875, + "completions/mean_terminated_length": 565.0875, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.11295722424983978, + "epoch": 0.0216, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.03279825299978256, + "kl": 0.9358411014080048, + "learning_rate": 9.949921679371578e-06, + "loss": -0.0005372571758925915, + "num_tokens": 3699396.0, + "reward": 1.15362491607666, + "reward_std": 0.14572291895747186, + "rewards/env_game_reward/mean": 1.15362491607666, + "rewards/env_game_reward/std": 0.2874349907040596, + "sampling/importance_sampling_ratio/max": 1.4842311143875122, + "sampling/importance_sampling_ratio/mean": 1.0119757771492004, + "sampling/importance_sampling_ratio/min": 0.4642439320683479, + "sampling/sampling_logp_difference/max": 0.8794727861881256, + "sampling/sampling_logp_difference/mean": 0.02908453196287155, + "step": 270, + "step_time": 2.7460473307997746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.003708791360259056, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003708791360259056, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.6, + "completions/max_terminated_length": 729.6, + "completions/mean_length": 578.45, + "completions/mean_terminated_length": 578.45, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.10119448900222779, + "epoch": 0.022, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.040497273206710815, + "kl": 1.5820525407791137, + "learning_rate": 9.949890654982923e-06, + "loss": -0.00091603584587574, + "num_tokens": 3790479.0, + "reward": 1.088499939441681, + "reward_std": 0.26846486926078794, + "rewards/env_game_reward/mean": 1.088499939441681, + "rewards/env_game_reward/std": 0.408444818854332, + "sampling/importance_sampling_ratio/max": 1.219689965248108, + "sampling/importance_sampling_ratio/mean": 0.9480740189552307, + "sampling/importance_sampling_ratio/min": 0.3397506684064865, + "sampling/sampling_logp_difference/max": 1.1102912425994873, + "sampling/sampling_logp_difference/mean": 0.030945492908358574, + "step": 275, + "step_time": 2.881785232200491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0037749288603663445, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0037749288603663445, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.2, + "completions/max_terminated_length": 715.2, + "completions/mean_length": 589.2375, + "completions/mean_terminated_length": 589.2375, + "completions/min_length": 422.4, + "completions/min_terminated_length": 422.4, + "entropy": 0.08195799887180329, + "epoch": 0.0224, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.023325594142079353, + "kl": 1.135708212852478, + "learning_rate": 9.949858974776267e-06, + "loss": -0.000301552121527493, + "num_tokens": 3882655.0, + "reward": 1.191166639328003, + "reward_std": 0.15391356945037843, + "rewards/env_game_reward/mean": 1.191166639328003, + "rewards/env_game_reward/std": 0.2715784251689911, + "sampling/importance_sampling_ratio/max": 1.4824723482131958, + "sampling/importance_sampling_ratio/mean": 1.0203513145446776, + "sampling/importance_sampling_ratio/min": 0.5855113506317139, + "sampling/sampling_logp_difference/max": 0.7591158509254455, + "sampling/sampling_logp_difference/mean": 0.025018543750047684, + "step": 280, + "step_time": 2.7125778502006144 + }, + { + "clip_ratio/high_max": 0.003333333507180214, + "clip_ratio/high_mean": 0.001666666753590107, + "clip_ratio/low_mean": 0.005631868354976177, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0072985351085662845, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.2, + "completions/max_terminated_length": 731.2, + "completions/mean_length": 586.3, + "completions/mean_terminated_length": 586.3, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.08055115789175034, + "epoch": 0.0228, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.028693703934550285, + "kl": 1.138446581363678, + "learning_rate": 9.949826638757177e-06, + "loss": -0.0006620377767831087, + "num_tokens": 3973740.0, + "reward": 1.0940416216850282, + "reward_std": 0.19934517294168472, + "rewards/env_game_reward/mean": 1.0940416216850282, + "rewards/env_game_reward/std": 0.37726728320121766, + "sampling/importance_sampling_ratio/max": 1.7476803302764892, + "sampling/importance_sampling_ratio/mean": 1.0127242803573608, + "sampling/importance_sampling_ratio/min": 0.5199615359306335, + "sampling/sampling_logp_difference/max": 0.7218784093856812, + "sampling/sampling_logp_difference/mean": 0.02444089874625206, + "step": 285, + "step_time": 2.7483612370000627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.6, + "completions/max_terminated_length": 714.6, + "completions/mean_length": 537.3, + "completions/mean_terminated_length": 537.3, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.049627304822206494, + "epoch": 0.0232, + "frac_reward_zero_std": 0.85, + "grad_norm": 0.03257312625646591, + "kl": 0.7880214869976043, + "learning_rate": 9.949793646931339e-06, + "loss": -4.999106749892235e-06, + "num_tokens": 4059661.0, + "reward": 1.2024999141693116, + "reward_std": 0.09192387312650681, + "rewards/env_game_reward/mean": 1.2024999141693116, + "rewards/env_game_reward/std": 0.23650072515010834, + "sampling/importance_sampling_ratio/max": 1.458734655380249, + "sampling/importance_sampling_ratio/mean": 1.007603394985199, + "sampling/importance_sampling_ratio/min": 0.7188777476549149, + "sampling/sampling_logp_difference/max": 0.6383673250675201, + "sampling/sampling_logp_difference/mean": 0.020518184872344136, + "step": 290, + "step_time": 2.739068496599066 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0017241379246115685, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017241379246115685, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.6, + "completions/max_terminated_length": 711.6, + "completions/mean_length": 546.625, + "completions/mean_terminated_length": 546.625, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "entropy": 0.06519524082541465, + "epoch": 0.0236, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.07401054352521896, + "kl": 1.4018354892730713, + "learning_rate": 9.949759999304552e-06, + "loss": -0.00022121451329439878, + "num_tokens": 4146522.0, + "reward": 1.1591666221618653, + "reward_std": 0.168527103215456, + "rewards/env_game_reward/mean": 1.1591666221618653, + "rewards/env_game_reward/std": 0.28247022032737734, + "sampling/importance_sampling_ratio/max": 1.5240054845809936, + "sampling/importance_sampling_ratio/mean": 1.0128975629806518, + "sampling/importance_sampling_ratio/min": 0.6516130924224853, + "sampling/sampling_logp_difference/max": 0.5109567165374755, + "sampling/sampling_logp_difference/mean": 0.01806719144806266, + "step": 295, + "step_time": 2.7780750279984203 + }, + { + "clip_ratio/high_max": 0.003846153989434242, + "clip_ratio/high_mean": 0.001923076994717121, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001923076994717121, + "completions/clipped_ratio": 0.0, + "completions/max_length": 696.4, + "completions/max_terminated_length": 696.4, + "completions/mean_length": 552.8625, + "completions/mean_terminated_length": 552.8625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.05754914581775665, + "epoch": 0.024, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.004387096501886845, + "kl": 0.762263560295105, + "learning_rate": 9.949725695882732e-06, + "loss": 0.00016510837012901903, + "num_tokens": 4234609.0, + "reward": 1.234499979019165, + "reward_std": 0.06163613530807197, + "rewards/env_game_reward/mean": 1.234499979019165, + "rewards/env_game_reward/std": 0.15678054541349412, + "sampling/importance_sampling_ratio/max": 1.2199280977249145, + "sampling/importance_sampling_ratio/mean": 1.0034643888473511, + "sampling/importance_sampling_ratio/min": 0.7295505046844483, + "sampling/sampling_logp_difference/max": 0.5000777631998062, + "sampling/sampling_logp_difference/mean": 0.012467277515679599, + "step": 300, + "step_time": 2.7151460476001374 + }, + { + "epoch": 0.024, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 703.3333333333334, + "eval_completions/max_terminated_length": 703.3333333333334, + "eval_completions/mean_length": 617.3333333333334, + "eval_completions/mean_terminated_length": 617.3333333333334, + "eval_completions/min_length": 481.6666666666667, + "eval_completions/min_terminated_length": 481.6666666666667, + "eval_entropy": 0.0684248556693395, + "eval_frac_reward_zero_std": 0.75, + "eval_kl": 1.430545687675476, + "eval_loss": 0.0002169054205296561, + "eval_num_tokens": 4234609.0, + "eval_reward": 1.2093055248260498, + "eval_reward_std": 0.07719248533248901, + "eval_rewards/env_game_reward/mean": 1.2093055248260498, + "eval_rewards/env_game_reward/std": 0.15293016036351523, + "eval_runtime": 2.8392, + "eval_samples_per_second": 3.522, + "eval_sampling/importance_sampling_ratio/max": 1.2929344177246094, + "eval_sampling/importance_sampling_ratio/mean": 1.0424925486246746, + "eval_sampling/importance_sampling_ratio/min": 0.8097424507141113, + "eval_sampling/sampling_logp_difference/max": 0.27991748849550885, + "eval_sampling/sampling_logp_difference/mean": 0.015183204164107641, + "eval_steps_per_second": 0.704, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 528.225, + "completions/mean_terminated_length": 528.225, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.0414741288870573, + "epoch": 0.0244, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.00968001689761877, + "kl": 1.175690919160843, + "learning_rate": 9.949690736671905e-06, + "loss": -1.1410797014832497e-05, + "num_tokens": 4319539.0, + "reward": 1.2401665925979615, + "reward_std": 0.06929646017961204, + "rewards/env_game_reward/mean": 1.2401665925979615, + "rewards/env_game_reward/std": 0.18133917301893235, + "sampling/importance_sampling_ratio/max": 1.2353633403778077, + "sampling/importance_sampling_ratio/mean": 0.9993547916412353, + "sampling/importance_sampling_ratio/min": 0.5523549929261208, + "sampling/sampling_logp_difference/max": 1.0161285698413849, + "sampling/sampling_logp_difference/mean": 0.01637693466618657, + "step": 305, + "step_time": 2.655280681199656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.001785714365541935, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001785714365541935, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 581.925, + "completions/mean_terminated_length": 581.925, + "completions/min_length": 452.4, + "completions/min_terminated_length": 452.4, + "entropy": 0.05037630945444107, + "epoch": 0.0248, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.01921161077916622, + "kl": 1.3029194891452789, + "learning_rate": 9.949655121678223e-06, + "loss": -0.00036651836708188055, + "num_tokens": 4411139.0, + "reward": 1.234999942779541, + "reward_std": 0.09192387759685516, + "rewards/env_game_reward/mean": 1.234999942779541, + "rewards/env_game_reward/std": 0.16591952741146088, + "sampling/importance_sampling_ratio/max": 1.1560017108917235, + "sampling/importance_sampling_ratio/mean": 0.9992516279220581, + "sampling/importance_sampling_ratio/min": 0.6994159758090973, + "sampling/sampling_logp_difference/max": 0.527313782274723, + "sampling/sampling_logp_difference/mean": 0.012010546866804361, + "step": 310, + "step_time": 2.6350250776005852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 713.8, + "completions/max_terminated_length": 713.8, + "completions/mean_length": 542.4375, + "completions/mean_terminated_length": 542.4375, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.054155930504202844, + "epoch": 0.0252, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.020733093842864037, + "kl": 1.5725973546504974, + "learning_rate": 9.949618850907941e-06, + "loss": 0.00024796247016638516, + "num_tokens": 4498257.0, + "reward": 1.2346249580383302, + "reward_std": 0.09210064932703972, + "rewards/env_game_reward/mean": 1.2346249580383302, + "rewards/env_game_reward/std": 0.16266353130340577, + "sampling/importance_sampling_ratio/max": 1.3359755039215089, + "sampling/importance_sampling_ratio/mean": 1.0149800062179566, + "sampling/importance_sampling_ratio/min": 0.7080724120140076, + "sampling/sampling_logp_difference/max": 0.4858335077762604, + "sampling/sampling_logp_difference/mean": 0.011928107030689717, + "step": 315, + "step_time": 2.831428350000351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002083333395421505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002083333395421505, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 579.0625, + "completions/mean_terminated_length": 579.0625, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.07051767148077488, + "epoch": 0.0256, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.01131022721529007, + "kl": 0.9661010444164276, + "learning_rate": 9.949581924367437e-06, + "loss": -0.0006595293991267682, + "num_tokens": 4588831.0, + "reward": 1.2023749113082887, + "reward_std": 0.13806259408593177, + "rewards/env_game_reward/mean": 1.2023749113082887, + "rewards/env_game_reward/std": 0.2536344364285469, + "sampling/importance_sampling_ratio/max": 1.3653532981872558, + "sampling/importance_sampling_ratio/mean": 1.0079243421554565, + "sampling/importance_sampling_ratio/min": 0.5111093282699585, + "sampling/sampling_logp_difference/max": 0.7481902837753296, + "sampling/sampling_logp_difference/mean": 0.020597192458808422, + "step": 320, + "step_time": 2.711340737000137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.4, + "completions/max_terminated_length": 714.4, + "completions/mean_length": 556.4625, + "completions/mean_terminated_length": 556.4625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.059214940294623375, + "epoch": 0.026, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.03349935635924339, + "kl": 1.0856446743011474, + "learning_rate": 9.9495443420632e-06, + "loss": -0.0004986363928765059, + "num_tokens": 4676780.0, + "reward": 1.191541600227356, + "reward_std": 0.12274194019846618, + "rewards/env_game_reward/mean": 1.191541600227356, + "rewards/env_game_reward/std": 0.2498372197151184, + "sampling/importance_sampling_ratio/max": 1.12011821269989, + "sampling/importance_sampling_ratio/mean": 0.9541650176048279, + "sampling/importance_sampling_ratio/min": 0.45518134236335756, + "sampling/sampling_logp_difference/max": 1.0134361952543258, + "sampling/sampling_logp_difference/mean": 0.02279095593839884, + "step": 325, + "step_time": 2.610014723199856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.2, + "completions/max_terminated_length": 698.2, + "completions/mean_length": 564.2, + "completions/mean_terminated_length": 564.2, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.06921758279204368, + "epoch": 0.0264, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.04242366924881935, + "kl": 1.510494002699852, + "learning_rate": 9.949506104001843e-06, + "loss": -0.000468868063762784, + "num_tokens": 4765601.0, + "reward": 1.1211249828338623, + "reward_std": 0.20700550079345703, + "rewards/env_game_reward/mean": 1.1211249828338623, + "rewards/env_game_reward/std": 0.34373362362384796, + "sampling/importance_sampling_ratio/max": 1.384414553642273, + "sampling/importance_sampling_ratio/mean": 0.9697236299514771, + "sampling/importance_sampling_ratio/min": 0.4015564054250717, + "sampling/sampling_logp_difference/max": 1.096415627002716, + "sampling/sampling_logp_difference/mean": 0.023835126869380475, + "step": 330, + "step_time": 2.618563677399652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0038518518209457397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0038518518209457397, + "completions/clipped_ratio": 0.0, + "completions/max_length": 731.8, + "completions/max_terminated_length": 731.8, + "completions/mean_length": 577.3, + "completions/mean_terminated_length": 577.3, + "completions/min_length": 396.8, + "completions/min_terminated_length": 396.8, + "entropy": 0.07292215041816234, + "epoch": 0.0268, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.010667292401194572, + "kl": 1.164840006828308, + "learning_rate": 9.94946721019008e-06, + "loss": -0.0007914766669273376, + "num_tokens": 4855821.0, + "reward": 1.2240416049957275, + "reward_std": 0.09210064932703972, + "rewards/env_game_reward/mean": 1.2240416049957275, + "rewards/env_game_reward/std": 0.18175842016935348, + "sampling/importance_sampling_ratio/max": 1.1197868585586548, + "sampling/importance_sampling_ratio/mean": 0.9610379815101624, + "sampling/importance_sampling_ratio/min": 0.43021807074546814, + "sampling/sampling_logp_difference/max": 6.813688334822655, + "sampling/sampling_logp_difference/mean": 0.10667178835719823, + "step": 335, + "step_time": 2.7418013950002207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0018518518656492234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0018518518656492234, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.2, + "completions/max_terminated_length": 730.2, + "completions/mean_length": 560.125, + "completions/mean_terminated_length": 560.125, + "completions/min_length": 420.8, + "completions/min_terminated_length": 420.8, + "entropy": 0.05461762771010399, + "epoch": 0.0272, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.008303332142531872, + "kl": 1.1738501071929932, + "learning_rate": 9.949427660634754e-06, + "loss": -0.00020156898535788058, + "num_tokens": 4944599.0, + "reward": 1.2348749399185182, + "reward_std": 0.09210065379738808, + "rewards/env_game_reward/mean": 1.2348749399185182, + "rewards/env_game_reward/std": 0.19004902094602585, + "sampling/importance_sampling_ratio/max": 1.2192065000534058, + "sampling/importance_sampling_ratio/mean": 0.993685245513916, + "sampling/importance_sampling_ratio/min": 0.5723252117633819, + "sampling/sampling_logp_difference/max": 0.6896804094314575, + "sampling/sampling_logp_difference/mean": 0.01418588999658823, + "step": 340, + "step_time": 2.7300288166021347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.001923076994717121, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001923076994717121, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.4, + "completions/max_terminated_length": 715.4, + "completions/mean_length": 569.075, + "completions/mean_terminated_length": 569.075, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.06558555997908115, + "epoch": 0.0276, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.049474235624074936, + "kl": 1.3128838062286377, + "learning_rate": 9.949387455342814e-06, + "loss": -0.00022792629897594452, + "num_tokens": 5033871.0, + "reward": 1.2022499322891236, + "reward_std": 0.13823936092667283, + "rewards/env_game_reward/mean": 1.2022499322891236, + "rewards/env_game_reward/std": 0.24752375930547715, + "sampling/importance_sampling_ratio/max": 1.2796708583831786, + "sampling/importance_sampling_ratio/mean": 0.9890965223312378, + "sampling/importance_sampling_ratio/min": 0.5188629031181335, + "sampling/sampling_logp_difference/max": 0.6587271898984909, + "sampling/sampling_logp_difference/mean": 0.015436801221221685, + "step": 345, + "step_time": 2.8809383545994933 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002083333395421505, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002083333395421505, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.8, + "completions/max_terminated_length": 729.8, + "completions/mean_length": 572.375, + "completions/mean_terminated_length": 572.375, + "completions/min_length": 421.6, + "completions/min_terminated_length": 421.6, + "entropy": 0.06578464694321155, + "epoch": 0.028, + "frac_reward_zero_std": 0.825, + "grad_norm": 0.00010816263238666579, + "kl": 1.1110517114400864, + "learning_rate": 9.949346594321329e-06, + "loss": -0.0002046652138233185, + "num_tokens": 5123373.0, + "reward": 1.2458332538604737, + "reward_std": 0.07660322934389115, + "rewards/env_game_reward/mean": 1.2458332538604737, + "rewards/env_game_reward/std": 0.16050206124782562, + "sampling/importance_sampling_ratio/max": 1.1853637456893922, + "sampling/importance_sampling_ratio/mean": 0.9942168712615966, + "sampling/importance_sampling_ratio/min": 0.770496129989624, + "sampling/sampling_logp_difference/max": 0.3065337672829628, + "sampling/sampling_logp_difference/mean": 0.010001219715923071, + "step": 350, + "step_time": 2.7594579180004075 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 732.0, + "completions/max_terminated_length": 732.0, + "completions/mean_length": 590.825, + "completions/mean_terminated_length": 590.825, + "completions/min_length": 436.0, + "completions/min_terminated_length": 436.0, + "entropy": 0.0713444285094738, + "epoch": 0.0284, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.02447124384343624, + "kl": 0.9440391480922699, + "learning_rate": 9.949305077577481e-06, + "loss": -0.0007917589507997036, + "num_tokens": 5214490.0, + "reward": 1.2240416049957275, + "reward_std": 0.07678000405430793, + "rewards/env_game_reward/mean": 1.2240416049957275, + "rewards/env_game_reward/std": 0.20674404948949815, + "sampling/importance_sampling_ratio/max": 1.48435320854187, + "sampling/importance_sampling_ratio/mean": 1.0448272943496704, + "sampling/importance_sampling_ratio/min": 0.6205300271511078, + "sampling/sampling_logp_difference/max": 0.699562880396843, + "sampling/sampling_logp_difference/mean": 0.019524367339909077, + "step": 355, + "step_time": 2.7076195863992325 + }, + { + "clip_ratio/high_max": 0.003846153989434242, + "clip_ratio/high_mean": 0.001923076994717121, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001923076994717121, + "completions/clipped_ratio": 0.0, + "completions/max_length": 681.0, + "completions/max_terminated_length": 681.0, + "completions/mean_length": 547.5, + "completions/mean_terminated_length": 547.5, + "completions/min_length": 408.2, + "completions/min_terminated_length": 408.2, + "entropy": 0.07489714697003365, + "epoch": 0.0288, + "frac_reward_zero_std": 0.7, + "grad_norm": 0.018365204334259033, + "kl": 1.3538099706172944, + "learning_rate": 9.949262905118568e-06, + "loss": -0.0006200538948178292, + "num_tokens": 5301101.0, + "reward": 1.2171249389648438, + "reward_std": 0.10188229638151824, + "rewards/env_game_reward/mean": 1.2171249389648438, + "rewards/env_game_reward/std": 0.18957828879356384, + "sampling/importance_sampling_ratio/max": 1.3922180652618408, + "sampling/importance_sampling_ratio/mean": 0.9885348677635193, + "sampling/importance_sampling_ratio/min": 0.6369173884391784, + "sampling/sampling_logp_difference/max": 0.6018468141555786, + "sampling/sampling_logp_difference/mean": 0.018113284837454557, + "step": 360, + "step_time": 2.585118543999852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.001923076994717121, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001923076994717121, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.6, + "completions/max_terminated_length": 715.6, + "completions/mean_length": 547.125, + "completions/mean_terminated_length": 547.125, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.061598078906536104, + "epoch": 0.0292, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.02872176468372345, + "kl": 1.7623548924922943, + "learning_rate": 9.949220076952004e-06, + "loss": 0.00013176617212593556, + "num_tokens": 5387242.0, + "reward": 1.229458260536194, + "reward_std": 0.09976096972823142, + "rewards/env_game_reward/mean": 1.229458260536194, + "rewards/env_game_reward/std": 0.18861736208200455, + "sampling/importance_sampling_ratio/max": 1.1277338027954102, + "sampling/importance_sampling_ratio/mean": 0.9845755219459533, + "sampling/importance_sampling_ratio/min": 0.6804550766944886, + "sampling/sampling_logp_difference/max": 0.5396802634000778, + "sampling/sampling_logp_difference/mean": 0.012317924201488495, + "step": 365, + "step_time": 2.8329780505999222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 714.6, + "completions/max_terminated_length": 714.6, + "completions/mean_length": 546.5375, + "completions/mean_terminated_length": 546.5375, + "completions/min_length": 421.6, + "completions/min_terminated_length": 421.6, + "entropy": 0.06278647035360337, + "epoch": 0.0296, + "frac_reward_zero_std": 0.8, + "grad_norm": 0.008219737559556961, + "kl": 1.525150203704834, + "learning_rate": 9.949176593085315e-06, + "loss": 0.00031437948346138, + "num_tokens": 5474001.0, + "reward": 1.191666555404663, + "reward_std": 0.09192387461662292, + "rewards/env_game_reward/mean": 1.191666555404663, + "rewards/env_game_reward/std": 0.244113752245903, + "sampling/importance_sampling_ratio/max": 1.2466303348541259, + "sampling/importance_sampling_ratio/mean": 0.99187251329422, + "sampling/importance_sampling_ratio/min": 0.6674005150794983, + "sampling/sampling_logp_difference/max": 0.5849884033203125, + "sampling/sampling_logp_difference/mean": 0.013392899371683597, + "step": 370, + "step_time": 2.6906030645986903 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 558.05, + "completions/mean_terminated_length": 558.05, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.06691324710845947, + "epoch": 0.03, + "frac_reward_zero_std": 0.775, + "grad_norm": 0.016666624695062637, + "kl": 0.9236783385276794, + "learning_rate": 9.94913245352615e-06, + "loss": 7.541242521256209e-05, + "num_tokens": 5561757.0, + "reward": 1.2079166412353515, + "reward_std": 0.11490484774112701, + "rewards/env_game_reward/mean": 1.2079166412353515, + "rewards/env_game_reward/std": 0.2623890072107315, + "sampling/importance_sampling_ratio/max": 1.1636013031005858, + "sampling/importance_sampling_ratio/mean": 0.9869522213935852, + "sampling/importance_sampling_ratio/min": 0.5432810723781586, + "sampling/sampling_logp_difference/max": 0.6354977548122406, + "sampling/sampling_logp_difference/mean": 0.011450758669525385, + "step": 375, + "step_time": 2.7167678525976955 + }, + { + "epoch": 0.03, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 702.3333333333334, + "eval_completions/max_terminated_length": 702.3333333333334, + "eval_completions/mean_length": 618.25, + "eval_completions/mean_terminated_length": 618.25, + "eval_completions/min_length": 482.3333333333333, + "eval_completions/min_terminated_length": 482.3333333333333, + "eval_entropy": 0.08387033144632976, + "eval_frac_reward_zero_std": 0.5, + "eval_kl": 0.9737871090571085, + "eval_loss": -0.0007849211106076837, + "eval_num_tokens": 5561757.0, + "eval_reward": 1.1370832522710164, + "eval_reward_std": 0.23039893666282296, + "eval_rewards/env_game_reward/mean": 1.1370832522710164, + "eval_rewards/env_game_reward/std": 0.30880257207900286, + "eval_runtime": 2.8647, + "eval_samples_per_second": 3.491, + "eval_sampling/importance_sampling_ratio/max": 1.4039711157480876, + "eval_sampling/importance_sampling_ratio/mean": 1.006952742735545, + "eval_sampling/importance_sampling_ratio/min": 0.7436001698176066, + "eval_sampling/sampling_logp_difference/max": 0.5522954861323038, + "eval_sampling/sampling_logp_difference/mean": 0.019840245756010216, + "eval_steps_per_second": 0.698, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0014705882407724858, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014705882407724858, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.8, + "completions/max_terminated_length": 937.8, + "completions/mean_length": 735.0, + "completions/mean_terminated_length": 735.0, + "completions/min_length": 517.2, + "completions/min_terminated_length": 517.2, + "entropy": 0.0695138342678547, + "epoch": 0.0304, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.023199956864118576, + "kl": 1.087385755777359, + "learning_rate": 9.949087658282265e-06, + "loss": 0.00038635083474218844, + "num_tokens": 5665353.0, + "reward": 1.2418749570846557, + "reward_std": 0.12957732044160367, + "rewards/env_game_reward/mean": 1.2418749570846557, + "rewards/env_game_reward/std": 0.24777153581380845, + "sampling/importance_sampling_ratio/max": 1.760440754890442, + "sampling/importance_sampling_ratio/mean": 1.0248130679130554, + "sampling/importance_sampling_ratio/min": 0.5629997849464417, + "sampling/sampling_logp_difference/max": 0.7708295106887817, + "sampling/sampling_logp_difference/mean": 0.016029300354421138, + "step": 380, + "step_time": 3.315412589598418 + }, + { + "clip_ratio/high_max": 0.002857142873108387, + "clip_ratio/high_mean": 0.0014285714365541934, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014285714365541934, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1022.0, + "completions/max_terminated_length": 1022.0, + "completions/mean_length": 839.425, + "completions/mean_terminated_length": 839.425, + "completions/min_length": 611.2, + "completions/min_terminated_length": 611.2, + "entropy": 0.08930239342153072, + "epoch": 0.0308, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.07401212304830551, + "kl": 1.702320432662964, + "learning_rate": 9.949042207361537e-06, + "loss": -0.0006852170452475547, + "num_tokens": 5776827.0, + "reward": 1.1849999666213988, + "reward_std": 0.20470741838216783, + "rewards/env_game_reward/mean": 1.1849999666213988, + "rewards/env_game_reward/std": 0.30558564364910124, + "sampling/importance_sampling_ratio/max": 1.5303932189941407, + "sampling/importance_sampling_ratio/mean": 0.9755434513092041, + "sampling/importance_sampling_ratio/min": 0.5129167795181274, + "sampling/sampling_logp_difference/max": 0.8233615517616272, + "sampling/sampling_logp_difference/mean": 0.02174628246575594, + "step": 385, + "step_time": 3.589076187599858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0029910714365541935, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029910714365541935, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.8, + "completions/max_terminated_length": 1050.8, + "completions/mean_length": 794.475, + "completions/mean_terminated_length": 794.475, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.09299433380365371, + "epoch": 0.0312, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.042726535350084305, + "kl": 0.9145794808864594, + "learning_rate": 9.948996100771952e-06, + "loss": 0.0006041613407433033, + "num_tokens": 5884075.0, + "reward": 1.1981249570846557, + "reward_std": 0.18649942204356193, + "rewards/env_game_reward/mean": 1.1981249570846557, + "rewards/env_game_reward/std": 0.30469191670417783, + "sampling/importance_sampling_ratio/max": 1.5661879777908325, + "sampling/importance_sampling_ratio/mean": 1.0027704000473023, + "sampling/importance_sampling_ratio/min": 0.5443926572799682, + "sampling/sampling_logp_difference/max": 0.6991547107696533, + "sampling/sampling_logp_difference/mean": 0.02191640790551901, + "step": 390, + "step_time": 3.5801626390006276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002985739801079035, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002985739801079035, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.8, + "completions/max_terminated_length": 1047.8, + "completions/mean_length": 831.9625, + "completions/mean_terminated_length": 831.9625, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.10950318947434426, + "epoch": 0.0316, + "frac_reward_zero_std": 0.525, + "grad_norm": 0.024459250271320343, + "kl": 1.3264178693294526, + "learning_rate": 9.948949338521616e-06, + "loss": -0.00011393972672522069, + "num_tokens": 5994509.0, + "reward": 1.2248749732971191, + "reward_std": 0.16104357689619064, + "rewards/env_game_reward/mean": 1.2248749732971191, + "rewards/env_game_reward/std": 0.2880355179309845, + "sampling/importance_sampling_ratio/max": 1.3253472089767455, + "sampling/importance_sampling_ratio/mean": 1.016199254989624, + "sampling/importance_sampling_ratio/min": 0.6328357100486756, + "sampling/sampling_logp_difference/max": 0.5666046380996704, + "sampling/sampling_logp_difference/mean": 0.016961843892931937, + "step": 395, + "step_time": 3.593138352200185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0027046783827245234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027046783827245234, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 865.2375, + "completions/mean_terminated_length": 865.2375, + "completions/min_length": 651.6, + "completions/min_terminated_length": 651.6, + "entropy": 0.12832557484507562, + "epoch": 0.032, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.0840858444571495, + "kl": 1.16541907787323, + "learning_rate": 9.948901920618752e-06, + "loss": -0.0007970237173140049, + "num_tokens": 6108518.0, + "reward": 1.171999979019165, + "reward_std": 0.2605688437819481, + "rewards/env_game_reward/mean": 1.171999979019165, + "rewards/env_game_reward/std": 0.36585823595523836, + "sampling/importance_sampling_ratio/max": 1.6069575786590575, + "sampling/importance_sampling_ratio/mean": 0.9940932035446167, + "sampling/importance_sampling_ratio/min": 0.49497389793395996, + "sampling/sampling_logp_difference/max": 0.7419742822647095, + "sampling/sampling_logp_difference/mean": 0.023178784735500814, + "step": 400, + "step_time": 3.5148092628012817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002878289483487606, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002878289483487606, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.4, + "completions/max_terminated_length": 1044.4, + "completions/mean_length": 828.5875, + "completions/mean_terminated_length": 828.5875, + "completions/min_length": 610.4, + "completions/min_terminated_length": 610.4, + "entropy": 0.11367225870490075, + "epoch": 0.0324, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.07428044080734253, + "kl": 1.0149361670017243, + "learning_rate": 9.948853847071691e-06, + "loss": 0.00038471841253340244, + "num_tokens": 6218492.0, + "reward": 1.1938749551773071, + "reward_std": 0.25438166558742525, + "rewards/env_game_reward/mean": 1.1938749551773071, + "rewards/env_game_reward/std": 0.321139919757843, + "sampling/importance_sampling_ratio/max": 1.361912488937378, + "sampling/importance_sampling_ratio/mean": 1.0066336393356323, + "sampling/importance_sampling_ratio/min": 0.7280618786811829, + "sampling/sampling_logp_difference/max": 0.47970104217529297, + "sampling/sampling_logp_difference/mean": 0.013811025116592646, + "step": 405, + "step_time": 3.7229729885992127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0029910714365541935, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029910714365541935, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.4, + "completions/max_terminated_length": 1047.4, + "completions/mean_length": 831.125, + "completions/mean_terminated_length": 831.125, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.0979391686618328, + "epoch": 0.0328, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.057129692286252975, + "kl": 1.3072730243206023, + "learning_rate": 9.948805117888886e-06, + "loss": 4.2584206676110624e-05, + "num_tokens": 6328847.0, + "reward": 1.194124960899353, + "reward_std": 0.21690500825643538, + "rewards/env_game_reward/mean": 1.194124960899353, + "rewards/env_game_reward/std": 0.3049228638410568, + "sampling/importance_sampling_ratio/max": 1.4106521368026734, + "sampling/importance_sampling_ratio/mean": 0.9964896202087402, + "sampling/importance_sampling_ratio/min": 0.6699582457542419, + "sampling/sampling_logp_difference/max": 0.5707800269126893, + "sampling/sampling_logp_difference/mean": 0.016702639311552046, + "step": 410, + "step_time": 3.5735276011997485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002857142873108387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002857142873108387, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.6, + "completions/max_terminated_length": 1072.6, + "completions/mean_length": 851.675, + "completions/mean_terminated_length": 851.675, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.09468646496534347, + "epoch": 0.0332, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03625418618321419, + "kl": 0.9998983383178711, + "learning_rate": 9.948755733078905e-06, + "loss": 0.0005254603922367096, + "num_tokens": 6441493.0, + "reward": 1.203125, + "reward_std": 0.21655145585536956, + "rewards/env_game_reward/mean": 1.203125, + "rewards/env_game_reward/std": 0.3023660510778427, + "sampling/importance_sampling_ratio/max": 1.5803467512130738, + "sampling/importance_sampling_ratio/mean": 1.0241896748542785, + "sampling/importance_sampling_ratio/min": 0.5122612774372101, + "sampling/sampling_logp_difference/max": 0.7667708158493042, + "sampling/sampling_logp_difference/mean": 0.016751198470592497, + "step": 415, + "step_time": 3.535725635799463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002129505993798375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002129505993798375, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.6, + "completions/max_terminated_length": 1072.6, + "completions/mean_length": 812.1, + "completions/mean_terminated_length": 812.1, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.16425883509218692, + "epoch": 0.0336, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04025555029511452, + "kl": 1.0152770042419434, + "learning_rate": 9.948705692650427e-06, + "loss": 0.0013656719587743283, + "num_tokens": 6549145.0, + "reward": 1.2017499923706054, + "reward_std": 0.18137289136648177, + "rewards/env_game_reward/mean": 1.2017499923706054, + "rewards/env_game_reward/std": 0.2834285110235214, + "sampling/importance_sampling_ratio/max": 1.2725768566131592, + "sampling/importance_sampling_ratio/mean": 0.9965269207954407, + "sampling/importance_sampling_ratio/min": 0.5284772694110871, + "sampling/sampling_logp_difference/max": 0.6862696170806885, + "sampling/sampling_logp_difference/mean": 0.01916008032858372, + "step": 420, + "step_time": 3.653649864799809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.2, + "completions/max_terminated_length": 1046.2, + "completions/mean_length": 848.475, + "completions/mean_terminated_length": 848.475, + "completions/min_length": 617.6, + "completions/min_terminated_length": 617.6, + "entropy": 0.10671765431761741, + "epoch": 0.034, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0357169471681118, + "kl": 1.2555115342140197, + "learning_rate": 9.948654996612248e-06, + "loss": -0.0010916889645159245, + "num_tokens": 6661416.0, + "reward": 1.1152499675750733, + "reward_std": 0.241830512881279, + "rewards/env_game_reward/mean": 1.1152499675750733, + "rewards/env_game_reward/std": 0.3348828315734863, + "sampling/importance_sampling_ratio/max": 1.2912365436553954, + "sampling/importance_sampling_ratio/mean": 0.9513557910919189, + "sampling/importance_sampling_ratio/min": 0.4529180288314819, + "sampling/sampling_logp_difference/max": 0.838625431060791, + "sampling/sampling_logp_difference/mean": 0.02186411377042532, + "step": 425, + "step_time": 3.7595838884000843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0014285714365541934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014285714365541934, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.2, + "completions/max_terminated_length": 1047.2, + "completions/mean_length": 808.375, + "completions/mean_terminated_length": 808.375, + "completions/min_length": 631.4, + "completions/min_terminated_length": 631.4, + "entropy": 0.0735358338803053, + "epoch": 0.0344, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.024515505880117416, + "kl": 1.176445186138153, + "learning_rate": 9.948603644973282e-06, + "loss": 0.00011715407017618418, + "num_tokens": 6769624.0, + "reward": 1.2248749732971191, + "reward_std": 0.2105410486459732, + "rewards/env_game_reward/mean": 1.2248749732971191, + "rewards/env_game_reward/std": 0.2968492805957794, + "sampling/importance_sampling_ratio/max": 1.4149769067764282, + "sampling/importance_sampling_ratio/mean": 1.0460476994514465, + "sampling/importance_sampling_ratio/min": 0.855708372592926, + "sampling/sampling_logp_difference/max": 0.2869324326515198, + "sampling/sampling_logp_difference/mean": 0.010842377878725528, + "step": 430, + "step_time": 3.4251211591988975 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.8, + "completions/max_terminated_length": 996.8, + "completions/mean_length": 800.6125, + "completions/mean_terminated_length": 800.6125, + "completions/min_length": 578.0, + "completions/min_terminated_length": 578.0, + "entropy": 0.05469609908759594, + "epoch": 0.0348, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02258330211043358, + "kl": 1.0658069968223571, + "learning_rate": 9.948551637742554e-06, + "loss": 0.00048186536878347397, + "num_tokens": 6876746.0, + "reward": 1.2204999208450318, + "reward_std": 0.19197950065135955, + "rewards/env_game_reward/mean": 1.2204999208450318, + "rewards/env_game_reward/std": 0.2575215369462967, + "sampling/importance_sampling_ratio/max": 1.7662209749221802, + "sampling/importance_sampling_ratio/mean": 1.0243075132369994, + "sampling/importance_sampling_ratio/min": 0.6089048445224762, + "sampling/sampling_logp_difference/max": 0.8979941129684448, + "sampling/sampling_logp_difference/mean": 0.016185545828193426, + "step": 435, + "step_time": 3.3869100240008265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.6, + "completions/max_terminated_length": 1071.6, + "completions/mean_length": 838.85, + "completions/mean_terminated_length": 838.85, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.04340629056096077, + "epoch": 0.0352, + "frac_reward_zero_std": 0.675, + "grad_norm": 0.025899723172187805, + "kl": 1.2329406023025513, + "learning_rate": 9.948498974929206e-06, + "loss": 0.0002917660400271416, + "num_tokens": 6987662.0, + "reward": 1.2731249809265137, + "reward_std": 0.12993087247014046, + "rewards/env_game_reward/mean": 1.2731249809265137, + "rewards/env_game_reward/std": 0.2441350817680359, + "sampling/importance_sampling_ratio/max": 1.1621084928512573, + "sampling/importance_sampling_ratio/mean": 0.9838597655296326, + "sampling/importance_sampling_ratio/min": 0.6955267369747162, + "sampling/sampling_logp_difference/max": 0.48128714561462405, + "sampling/sampling_logp_difference/mean": 0.007422756869345903, + "step": 440, + "step_time": 3.551285775598808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0015151515603065492, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015151515603065492, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 835.5875, + "completions/mean_terminated_length": 835.5875, + "completions/min_length": 610.4, + "completions/min_terminated_length": 610.4, + "entropy": 0.04846553523093462, + "epoch": 0.0356, + "frac_reward_zero_std": 0.6, + "grad_norm": 0.15698401629924774, + "kl": 1.8966678202152252, + "learning_rate": 9.948445656542496e-06, + "loss": 0.0002397662028670311, + "num_tokens": 7098696.0, + "reward": 1.2731249809265137, + "reward_std": 0.14230524450540544, + "rewards/env_game_reward/mean": 1.2731249809265137, + "rewards/env_game_reward/std": 0.2304918497800827, + "sampling/importance_sampling_ratio/max": 1.4296421527862548, + "sampling/importance_sampling_ratio/mean": 1.0030879974365234, + "sampling/importance_sampling_ratio/min": 0.5428477764129639, + "sampling/sampling_logp_difference/max": 0.6600507885217667, + "sampling/sampling_logp_difference/mean": 0.011323550157248974, + "step": 445, + "step_time": 3.5293831905983097 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0027046783827245234, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027046783827245234, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 813.675, + "completions/mean_terminated_length": 813.675, + "completions/min_length": 618.8, + "completions/min_terminated_length": 618.8, + "entropy": 0.07172373905777932, + "epoch": 0.036, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.030395962297916412, + "kl": 1.2917304337024689, + "learning_rate": 9.948391682591799e-06, + "loss": 3.6328425630927085e-05, + "num_tokens": 7207378.0, + "reward": 1.1674999475479126, + "reward_std": 0.2174353301525116, + "rewards/env_game_reward/mean": 1.1674999475479126, + "rewards/env_game_reward/std": 0.32538898289203644, + "sampling/importance_sampling_ratio/max": 1.4234293460845948, + "sampling/importance_sampling_ratio/mean": 0.9796630859375, + "sampling/importance_sampling_ratio/min": 0.616313761472702, + "sampling/sampling_logp_difference/max": 0.6073259890079499, + "sampling/sampling_logp_difference/mean": 0.014473339542746544, + "step": 450, + "step_time": 3.6695362117985497 + }, + { + "epoch": 0.036, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1031.6666666666667, + "eval_completions/max_terminated_length": 1031.6666666666667, + "eval_completions/mean_length": 903.7083333333334, + "eval_completions/mean_terminated_length": 903.7083333333334, + "eval_completions/min_length": 700.0, + "eval_completions/min_terminated_length": 700.0, + "eval_entropy": 0.09484085440635681, + "eval_frac_reward_zero_std": 0.3333333333333333, + "eval_kl": 2.29697722196579, + "eval_loss": -0.0005131486104801297, + "eval_num_tokens": 7207378.0, + "eval_reward": 1.1229166587193806, + "eval_reward_std": 0.1856155296166738, + "eval_rewards/env_game_reward/mean": 1.1229166587193806, + "eval_rewards/env_game_reward/std": 0.2910057157278061, + "eval_runtime": 3.7272, + "eval_samples_per_second": 2.683, + "eval_sampling/importance_sampling_ratio/max": 1.1381525993347168, + "eval_sampling/importance_sampling_ratio/mean": 0.9641783038775126, + "eval_sampling/importance_sampling_ratio/min": 0.7627586523691813, + "eval_sampling/sampling_logp_difference/max": 0.25132163365681964, + "eval_sampling/sampling_logp_difference/mean": 0.012963244070609411, + "eval_steps_per_second": 0.537, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.2, + "completions/max_terminated_length": 1047.2, + "completions/mean_length": 834.8, + "completions/mean_terminated_length": 834.8, + "completions/min_length": 655.4, + "completions/min_terminated_length": 655.4, + "entropy": 0.07156877405941486, + "epoch": 0.0364, + "frac_reward_zero_std": 0.65, + "grad_norm": 0.02007192373275757, + "kl": 1.030625057220459, + "learning_rate": 9.9483370530866e-06, + "loss": -0.00047754105180501937, + "num_tokens": 7318135.0, + "reward": 1.259999966621399, + "reward_std": 0.12374369204044341, + "rewards/env_game_reward/mean": 1.259999966621399, + "rewards/env_game_reward/std": 0.2538477838039398, + "sampling/importance_sampling_ratio/max": 1.0907421350479125, + "sampling/importance_sampling_ratio/mean": 0.9758262634277344, + "sampling/importance_sampling_ratio/min": 0.6034112870693207, + "sampling/sampling_logp_difference/max": 0.446870756149292, + "sampling/sampling_logp_difference/mean": 0.008248830866068601, + "step": 455, + "step_time": 3.3988218826001684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0012820512987673283, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012820512987673283, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1047.8, + "completions/max_terminated_length": 1047.8, + "completions/mean_length": 826.6875, + "completions/mean_terminated_length": 826.6875, + "completions/min_length": 590.2, + "completions/min_terminated_length": 590.2, + "entropy": 0.08540964387357235, + "epoch": 0.0368, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.06558004766702652, + "kl": 0.8614380896091461, + "learning_rate": 9.948281768036502e-06, + "loss": 8.575599640607834e-05, + "num_tokens": 7427814.0, + "reward": 1.2334999561309814, + "reward_std": 0.1983434498310089, + "rewards/env_game_reward/mean": 1.2334999561309814, + "rewards/env_game_reward/std": 0.29357832968235015, + "sampling/importance_sampling_ratio/max": 1.2517836332321166, + "sampling/importance_sampling_ratio/mean": 0.9869768977165222, + "sampling/importance_sampling_ratio/min": 0.591094845533371, + "sampling/sampling_logp_difference/max": 0.5167952656745911, + "sampling/sampling_logp_difference/mean": 0.011756654176861048, + "step": 460, + "step_time": 3.48071748059956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002857142873108387, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002857142873108387, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1071.2, + "completions/max_terminated_length": 1071.2, + "completions/mean_length": 860.525, + "completions/mean_terminated_length": 860.525, + "completions/min_length": 611.6, + "completions/min_terminated_length": 611.6, + "entropy": 0.0760620430111885, + "epoch": 0.0372, + "frac_reward_zero_std": 0.55, + "grad_norm": 0.03671133518218994, + "kl": 1.1999324023723603, + "learning_rate": 9.948225827451225e-06, + "loss": 0.0007083296775817871, + "num_tokens": 7540850.0, + "reward": 1.2249999523162842, + "reward_std": 0.19798989295959474, + "rewards/env_game_reward/mean": 1.2249999523162842, + "rewards/env_game_reward/std": 0.2860778242349625, + "sampling/importance_sampling_ratio/max": 1.3862907409667968, + "sampling/importance_sampling_ratio/mean": 1.022992479801178, + "sampling/importance_sampling_ratio/min": 0.6777822971343994, + "sampling/sampling_logp_difference/max": 0.46881517171859743, + "sampling/sampling_logp_difference/mean": 0.009866528119891882, + "step": 465, + "step_time": 3.5962128964005386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1073.0, + "completions/max_terminated_length": 1073.0, + "completions/mean_length": 826.9875, + "completions/mean_terminated_length": 826.9875, + "completions/min_length": 632.0, + "completions/min_terminated_length": 632.0, + "entropy": 0.06233834847807884, + "epoch": 0.0376, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.034234557300806046, + "kl": 0.7558440536260604, + "learning_rate": 9.948169231340603e-06, + "loss": 5.739857442677021e-06, + "num_tokens": 7651407.0, + "reward": 1.246874952316284, + "reward_std": 0.19180271327495574, + "rewards/env_game_reward/mean": 1.246874952316284, + "rewards/env_game_reward/std": 0.30952975153923035, + "sampling/importance_sampling_ratio/max": 1.133919596672058, + "sampling/importance_sampling_ratio/mean": 0.9887906551361084, + "sampling/importance_sampling_ratio/min": 0.6729271888732911, + "sampling/sampling_logp_difference/max": 0.4064432859420776, + "sampling/sampling_logp_difference/mean": 0.006795862503349781, + "step": 470, + "step_time": 3.6208750205994873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.2, + "completions/max_terminated_length": 1049.2, + "completions/mean_length": 803.9625, + "completions/mean_terminated_length": 803.9625, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.04374380446970463, + "epoch": 0.038, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.03510225564241409, + "kl": 0.8806528657674789, + "learning_rate": 9.948111979714584e-06, + "loss": -3.6415911745280026e-05, + "num_tokens": 7758750.0, + "reward": 1.2643749475479127, + "reward_std": 0.12993087470531464, + "rewards/env_game_reward/mean": 1.2643749475479127, + "rewards/env_game_reward/std": 0.2548509627580643, + "sampling/importance_sampling_ratio/max": 1.0943125486373901, + "sampling/importance_sampling_ratio/mean": 0.9773291110992431, + "sampling/importance_sampling_ratio/min": 0.6454042971134186, + "sampling/sampling_logp_difference/max": 0.5180009245872498, + "sampling/sampling_logp_difference/mean": 0.00764626101590693, + "step": 475, + "step_time": 3.543976286398538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0001736111124046147, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0001736111124046147, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.4, + "completions/max_terminated_length": 1023.4, + "completions/mean_length": 789.3625, + "completions/mean_terminated_length": 789.3625, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.07270487509667874, + "epoch": 0.0384, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02017202600836754, + "kl": 1.002439457178116, + "learning_rate": 9.948054072583232e-06, + "loss": 0.0005217269062995911, + "num_tokens": 7864541.0, + "reward": 1.2366249322891236, + "reward_std": 0.1935704916715622, + "rewards/env_game_reward/mean": 1.2366249322891236, + "rewards/env_game_reward/std": 0.23191870152950286, + "sampling/importance_sampling_ratio/max": 1.3867063045501709, + "sampling/importance_sampling_ratio/mean": 1.0338881731033325, + "sampling/importance_sampling_ratio/min": 0.6520328521747821, + "sampling/sampling_logp_difference/max": 4.570292866230011, + "sampling/sampling_logp_difference/mean": 0.015752241667360067, + "step": 480, + "step_time": 3.5510043223992396 + }, + { + "clip_ratio/high_max": 0.003125, + "clip_ratio/high_mean": 0.0015625, + "clip_ratio/low_mean": 0.0014705882407724858, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003033088240772486, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.8, + "completions/max_terminated_length": 1023.8, + "completions/mean_length": 805.8, + "completions/mean_terminated_length": 805.8, + "completions/min_length": 610.2, + "completions/min_terminated_length": 610.2, + "entropy": 0.053855656459927556, + "epoch": 0.0388, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.06707289814949036, + "kl": 0.8074522137641906, + "learning_rate": 9.947995509956728e-06, + "loss": 0.0006144857034087181, + "num_tokens": 7972352.0, + "reward": 1.2990000009536744, + "reward_std": 0.09333809614181518, + "rewards/env_game_reward/mean": 1.2990000009536744, + "rewards/env_game_reward/std": 0.21436608731746673, + "sampling/importance_sampling_ratio/max": 1.468837523460388, + "sampling/importance_sampling_ratio/mean": 1.0262328863143921, + "sampling/importance_sampling_ratio/min": 0.7864007472991943, + "sampling/sampling_logp_difference/max": 0.3819884166121483, + "sampling/sampling_logp_difference/mean": 0.006758352974429727, + "step": 485, + "step_time": 3.4237150788008877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0027863777242600916, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027863777242600916, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1048.2, + "completions/max_terminated_length": 1048.2, + "completions/mean_length": 811.15, + "completions/mean_terminated_length": 811.15, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.07090941481292248, + "epoch": 0.0392, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.028884613886475563, + "kl": 0.9331954538822174, + "learning_rate": 9.947936291845363e-06, + "loss": -0.00021723993122577668, + "num_tokens": 8080617.0, + "reward": 1.2640000104904174, + "reward_std": 0.1552099421620369, + "rewards/env_game_reward/mean": 1.2640000104904174, + "rewards/env_game_reward/std": 0.22860259115695952, + "sampling/importance_sampling_ratio/max": 1.3268755435943604, + "sampling/importance_sampling_ratio/mean": 0.9921948909759521, + "sampling/importance_sampling_ratio/min": 0.7053830564022064, + "sampling/sampling_logp_difference/max": 0.5805891335010529, + "sampling/sampling_logp_difference/mean": 0.011148639302700759, + "step": 490, + "step_time": 3.587002480799856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0014285714365541934, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014285714365541934, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 826.6375, + "completions/mean_terminated_length": 826.6375, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.07093169055879116, + "epoch": 0.0396, + "frac_reward_zero_std": 0.45, + "grad_norm": 0.038124196231365204, + "kl": 0.8121790528297425, + "learning_rate": 9.94787641825955e-06, + "loss": 0.00019427400548011065, + "num_tokens": 8190006.0, + "reward": 1.2337499618530274, + "reward_std": 0.21036426872015, + "rewards/env_game_reward/mean": 1.2337499618530274, + "rewards/env_game_reward/std": 0.29140671491622927, + "sampling/importance_sampling_ratio/max": 1.1874067783355713, + "sampling/importance_sampling_ratio/mean": 0.9822077751159668, + "sampling/importance_sampling_ratio/min": 0.6450884461402893, + "sampling/sampling_logp_difference/max": 0.5654253602027893, + "sampling/sampling_logp_difference/mean": 0.009776360169053078, + "step": 495, + "step_time": 3.513944055599859 + }, + { + "clip_ratio/high_max": 0.003125, + "clip_ratio/high_mean": 0.0015625, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0015625, + "completions/clipped_ratio": 0.0, + "completions/max_length": 950.0, + "completions/max_terminated_length": 950.0, + "completions/mean_length": 796.3625, + "completions/mean_terminated_length": 796.3625, + "completions/min_length": 630.6, + "completions/min_terminated_length": 630.6, + "entropy": 0.06923086419701577, + "epoch": 0.04, + "frac_reward_zero_std": 0.575, + "grad_norm": 0.04387466236948967, + "kl": 0.8395125925540924, + "learning_rate": 9.947815889209812e-06, + "loss": 0.00012775636278092862, + "num_tokens": 8295385.0, + "reward": 1.2642499923706054, + "reward_std": 0.12975409924983977, + "rewards/env_game_reward/mean": 1.2642499923706054, + "rewards/env_game_reward/std": 0.23409587144851685, + "sampling/importance_sampling_ratio/max": 1.3154429912567138, + "sampling/importance_sampling_ratio/mean": 1.000835919380188, + "sampling/importance_sampling_ratio/min": 0.6876768827438354, + "sampling/sampling_logp_difference/max": 0.45278497934341433, + "sampling/sampling_logp_difference/mean": 0.007802166696637869, + "step": 500, + "step_time": 3.2585985789992264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1178.4, + "completions/max_terminated_length": 1178.4, + "completions/mean_length": 906.1625, + "completions/mean_terminated_length": 906.1625, + "completions/min_length": 686.0, + "completions/min_terminated_length": 686.0, + "entropy": 0.07725062370300292, + "epoch": 0.0404, + "frac_reward_zero_std": 0.475, + "grad_norm": 0.08721896260976791, + "kl": 0.8332188785076141, + "learning_rate": 9.947754704706791e-06, + "loss": -0.0001865808852016926, + "num_tokens": 8411094.0, + "reward": 1.1867499947547913, + "reward_std": 0.21460690796375276, + "rewards/env_game_reward/mean": 1.1867499947547913, + "rewards/env_game_reward/std": 0.3215657353401184, + "sampling/importance_sampling_ratio/max": 1.3392818450927735, + "sampling/importance_sampling_ratio/mean": 1.0001285552978516, + "sampling/importance_sampling_ratio/min": 0.7314820289611816, + "sampling/sampling_logp_difference/max": 0.4349968358874321, + "sampling/sampling_logp_difference/mean": 0.010020055808126926, + "step": 505, + "step_time": 3.8816107694001403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0011363636702299118, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011363636702299118, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1330.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 1020.9625, + "completions/mean_terminated_length": 1020.9625, + "completions/min_length": 758.6, + "completions/min_terminated_length": 758.6, + "entropy": 0.060335731133818626, + "epoch": 0.0408, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02782941795885563, + "kl": 0.7987826824188232, + "learning_rate": 9.947692864761243e-06, + "loss": -0.00011843224056065082, + "num_tokens": 8535366.0, + "reward": 1.3006249904632567, + "reward_std": 0.17589280307292937, + "rewards/env_game_reward/mean": 1.3006249904632567, + "rewards/env_game_reward/std": 0.27952331900596616, + "sampling/importance_sampling_ratio/max": 1.2727408409118652, + "sampling/importance_sampling_ratio/mean": 1.0021348953247071, + "sampling/importance_sampling_ratio/min": 0.6616379976272583, + "sampling/sampling_logp_difference/max": 0.705156409740448, + "sampling/sampling_logp_difference/mean": 0.010359429102391005, + "step": 510, + "step_time": 4.54856540679757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0011363636702299118, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011363636702299118, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1332.6, + "completions/max_terminated_length": 1332.6, + "completions/mean_length": 1048.3875, + "completions/mean_terminated_length": 1048.3875, + "completions/min_length": 760.6, + "completions/min_terminated_length": 760.6, + "entropy": 0.08999732621014118, + "epoch": 0.0412, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.05762872472405434, + "kl": 0.8527314305305481, + "learning_rate": 9.947630369384036e-06, + "loss": 0.0009840115904808044, + "num_tokens": 8662752.0, + "reward": 1.2363749980926513, + "reward_std": 0.20240932703018188, + "rewards/env_game_reward/mean": 1.2363749980926513, + "rewards/env_game_reward/std": 0.2678327292203903, + "sampling/importance_sampling_ratio/max": 1.7645922422409057, + "sampling/importance_sampling_ratio/mean": 1.0444591283798217, + "sampling/importance_sampling_ratio/min": 0.6512416243553162, + "sampling/sampling_logp_difference/max": 0.5892035603523255, + "sampling/sampling_logp_difference/mean": 0.01366331558674574, + "step": 515, + "step_time": 4.4831126509998285 + }, + { + "clip_ratio/high_max": 0.0024390242993831634, + "clip_ratio/high_mean": 0.0012195121496915817, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0012195121496915817, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.8, + "completions/max_terminated_length": 1366.8, + "completions/mean_length": 1012.5875, + "completions/mean_terminated_length": 1012.5875, + "completions/min_length": 748.8, + "completions/min_terminated_length": 748.8, + "entropy": 0.06842279564589263, + "epoch": 0.0416, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.051058683544397354, + "kl": 0.956644493341446, + "learning_rate": 9.947567218586157e-06, + "loss": 0.0002877896185964346, + "num_tokens": 8786459.0, + "reward": 1.2256250619888305, + "reward_std": 0.20735905468463897, + "rewards/env_game_reward/mean": 1.2256250619888305, + "rewards/env_game_reward/std": 0.2896923661231995, + "sampling/importance_sampling_ratio/max": 1.444146990776062, + "sampling/importance_sampling_ratio/mean": 0.9814066886901855, + "sampling/importance_sampling_ratio/min": 0.6000928044319153, + "sampling/sampling_logp_difference/max": 0.5186550855636597, + "sampling/sampling_logp_difference/mean": 0.012273849081248045, + "step": 520, + "step_time": 4.551891402601177 + }, + { + "clip_ratio/high_max": 0.0023255813866853714, + "clip_ratio/high_mean": 0.0011627906933426857, + "clip_ratio/low_mean": 0.0023532669059932233, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003516057599335909, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.2, + "completions/max_terminated_length": 1362.2, + "completions/mean_length": 1080.0125, + "completions/mean_terminated_length": 1080.0125, + "completions/min_length": 831.4, + "completions/min_terminated_length": 831.4, + "entropy": 0.054267326928675176, + "epoch": 0.042, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.016077129170298576, + "kl": 0.9028543293476105, + "learning_rate": 9.94750341237871e-06, + "loss": -0.001446277927607298, + "num_tokens": 8917463.0, + "reward": 1.2988749742507935, + "reward_std": 0.21867277324199677, + "rewards/env_game_reward/mean": 1.2988749742507935, + "rewards/env_game_reward/std": 0.2746602237224579, + "sampling/importance_sampling_ratio/max": 1.3581011295318604, + "sampling/importance_sampling_ratio/mean": 0.8896560549736023, + "sampling/importance_sampling_ratio/min": 0.29013479351997373, + "sampling/sampling_logp_difference/max": 0.9900324821472168, + "sampling/sampling_logp_difference/mean": 0.024117074348032473, + "step": 525, + "step_time": 4.540563870201003 + }, + { + "epoch": 0.042, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1346.6666666666667, + "eval_completions/max_terminated_length": 1346.6666666666667, + "eval_completions/mean_length": 1164.4583333333333, + "eval_completions/mean_terminated_length": 1164.4583333333333, + "eval_completions/min_length": 842.0, + "eval_completions/min_terminated_length": 842.0, + "eval_entropy": 0.05087510993083318, + "eval_frac_reward_zero_std": 0.4166666666666667, + "eval_kl": 0.9046386082967123, + "eval_loss": -0.00028693454805761576, + "eval_num_tokens": 8917463.0, + "eval_reward": 1.2712500095367432, + "eval_reward_std": 0.17736596086372933, + "eval_rewards/env_game_reward/mean": 1.2712500095367432, + "eval_rewards/env_game_reward/std": 0.3256373902161916, + "eval_runtime": 4.9449, + "eval_samples_per_second": 2.022, + "eval_sampling/importance_sampling_ratio/max": 1.1742435693740845, + "eval_sampling/importance_sampling_ratio/mean": 0.8475368817647299, + "eval_sampling/importance_sampling_ratio/min": 0.048990381260712944, + "eval_sampling/sampling_logp_difference/max": 1.8321036497751872, + "eval_sampling/sampling_logp_difference/mean": 0.03830909232298533, + "eval_steps_per_second": 0.404, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0021739130839705466, + "clip_ratio/high_mean": 0.0010869565419852733, + "clip_ratio/low_mean": 0.00701402323320508, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.008100979775190354, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1396.8, + "completions/max_terminated_length": 1396.8, + "completions/mean_length": 1032.575, + "completions/mean_terminated_length": 1032.575, + "completions/min_length": 734.8, + "completions/min_terminated_length": 734.8, + "entropy": 0.04630528762936592, + "epoch": 0.0424, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.4360601305961609, + "kl": 2.5725530982017517, + "learning_rate": 9.947438950772907e-06, + "loss": 0.00011557291727513075, + "num_tokens": 9043383.0, + "reward": 1.2701250076293946, + "reward_std": 0.16139711886644365, + "rewards/env_game_reward/mean": 1.2701250076293946, + "rewards/env_game_reward/std": 0.26454554200172425, + "sampling/importance_sampling_ratio/max": 1.2565020084381104, + "sampling/importance_sampling_ratio/mean": 0.8415638923645019, + "sampling/importance_sampling_ratio/min": 0.1712031990289688, + "sampling/sampling_logp_difference/max": 1.4327976703643799, + "sampling/sampling_logp_difference/mean": 0.032391348481178285, + "step": 530, + "step_time": 4.706439789199067 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0011627906933426857, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011627906933426857, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.2, + "completions/max_terminated_length": 1335.2, + "completions/mean_length": 1041.975, + "completions/mean_terminated_length": 1041.975, + "completions/min_length": 738.8, + "completions/min_terminated_length": 738.8, + "entropy": 0.11737375222146511, + "epoch": 0.0428, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.03724990040063858, + "kl": 2.714300978183746, + "learning_rate": 9.947373833780082e-06, + "loss": 0.0008453769609332084, + "num_tokens": 9170328.0, + "reward": 1.2677500009536744, + "reward_std": 0.16581654138863086, + "rewards/env_game_reward/mean": 1.2677500009536744, + "rewards/env_game_reward/std": 0.273353236913681, + "sampling/importance_sampling_ratio/max": 1.6070437669754027, + "sampling/importance_sampling_ratio/mean": 0.9828642964363098, + "sampling/importance_sampling_ratio/min": 0.4266847729682922, + "sampling/sampling_logp_difference/max": 1.137272548675537, + "sampling/sampling_logp_difference/mean": 0.020196602493524552, + "step": 535, + "step_time": 4.443584994000412 + }, + { + "clip_ratio/high_max": 0.0024390242993831634, + "clip_ratio/high_mean": 0.0012195121496915817, + "clip_ratio/low_mean": 0.0022543059661984445, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003473818115890026, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1329.4, + "completions/max_terminated_length": 1329.4, + "completions/mean_length": 1017.975, + "completions/mean_terminated_length": 1017.975, + "completions/min_length": 751.2, + "completions/min_terminated_length": 751.2, + "entropy": 0.2812275648117065, + "epoch": 0.0432, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.07084682583808899, + "kl": 0.9571394979953766, + "learning_rate": 9.947308061411678e-06, + "loss": 0.0005366875790059567, + "num_tokens": 9294399.0, + "reward": 0.9989999771118164, + "reward_std": 0.4200214326381683, + "rewards/env_game_reward/mean": 0.9989999771118164, + "rewards/env_game_reward/std": 0.4420970261096954, + "sampling/importance_sampling_ratio/max": 1.6333736658096314, + "sampling/importance_sampling_ratio/mean": 1.0077808022499084, + "sampling/importance_sampling_ratio/min": 0.6362493216991425, + "sampling/sampling_logp_difference/max": 0.6116647005081177, + "sampling/sampling_logp_difference/mean": 0.02449062168598175, + "step": 540, + "step_time": 4.512151982798969 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0010638297535479069, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010638297535479069, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1399.6, + "completions/max_terminated_length": 1399.6, + "completions/mean_length": 1103.425, + "completions/mean_terminated_length": 1103.425, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "entropy": 0.31689401865005495, + "epoch": 0.0436, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.05505409464240074, + "kl": 1.0262145280838013, + "learning_rate": 9.947241633679265e-06, + "loss": 0.0009896579198539258, + "num_tokens": 9427316.0, + "reward": 1.0178749799728393, + "reward_std": 0.3282743275165558, + "rewards/env_game_reward/mean": 1.0178749799728393, + "rewards/env_game_reward/std": 0.4406755268573761, + "sampling/importance_sampling_ratio/max": 1.4536394834518434, + "sampling/importance_sampling_ratio/mean": 1.0220404386520385, + "sampling/importance_sampling_ratio/min": 0.7271527051925659, + "sampling/sampling_logp_difference/max": 0.3957766056060791, + "sampling/sampling_logp_difference/mean": 0.017512153089046478, + "step": 545, + "step_time": 4.728214942799968 + }, + { + "clip_ratio/high_max": 0.002500000037252903, + "clip_ratio/high_mean": 0.0012500000186264515, + "clip_ratio/low_mean": 0.0010638297535479069, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002313829772174358, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1364.8, + "completions/max_terminated_length": 1364.8, + "completions/mean_length": 1103.85, + "completions/mean_terminated_length": 1103.85, + "completions/min_length": 813.0, + "completions/min_terminated_length": 813.0, + "entropy": 0.2996835097670555, + "epoch": 0.044, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.05984482169151306, + "kl": 1.0265156745910644, + "learning_rate": 9.947174550594512e-06, + "loss": -0.0004418407566845417, + "num_tokens": 9559899.0, + "reward": 1.1202500104904174, + "reward_std": 0.35602826476097105, + "rewards/env_game_reward/mean": 1.1202500104904174, + "rewards/env_game_reward/std": 0.3752091586589813, + "sampling/importance_sampling_ratio/max": 1.2323621749877929, + "sampling/importance_sampling_ratio/mean": 0.9733627676963806, + "sampling/importance_sampling_ratio/min": 0.5377955138683319, + "sampling/sampling_logp_difference/max": 0.519776713848114, + "sampling/sampling_logp_difference/mean": 0.019513449072837828, + "step": 550, + "step_time": 4.6486869933993145 + }, + { + "clip_ratio/high_max": 0.0023255813866853714, + "clip_ratio/high_mean": 0.0011627906933426857, + "clip_ratio/low_mean": 0.002273901831358671, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034366926178336144, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1365.4, + "completions/max_terminated_length": 1365.4, + "completions/mean_length": 1047.0375, + "completions/mean_terminated_length": 1047.0375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "entropy": 0.22820044606924056, + "epoch": 0.0444, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.13629508018493652, + "kl": 1.2704419553279878, + "learning_rate": 9.947106812169217e-06, + "loss": 0.0005231309216469527, + "num_tokens": 9687189.0, + "reward": 1.0570000410079956, + "reward_std": 0.2750645339488983, + "rewards/env_game_reward/mean": 1.0570000410079956, + "rewards/env_game_reward/std": 0.41252206563949584, + "sampling/importance_sampling_ratio/max": 1.3011247158050536, + "sampling/importance_sampling_ratio/mean": 0.997124445438385, + "sampling/importance_sampling_ratio/min": 0.6755926370620727, + "sampling/sampling_logp_difference/max": 0.3411633610725403, + "sampling/sampling_logp_difference/mean": 0.016007083281874657, + "step": 555, + "step_time": 4.4430775420005375 + }, + { + "clip_ratio/high_max": 0.004764605686068535, + "clip_ratio/high_mean": 0.0023823028430342676, + "clip_ratio/low_mean": 0.0022474748082458975, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004629777651280165, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1295.6, + "completions/max_terminated_length": 1295.6, + "completions/mean_length": 1009.375, + "completions/mean_terminated_length": 1009.375, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "entropy": 0.189104101061821, + "epoch": 0.0448, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.03939010575413704, + "kl": 1.5311454951763153, + "learning_rate": 9.947038418415283e-06, + "loss": 0.0001573776826262474, + "num_tokens": 9811161.0, + "reward": 1.1691249847412108, + "reward_std": 0.21195527017116547, + "rewards/env_game_reward/mean": 1.1691249847412108, + "rewards/env_game_reward/std": 0.30571516156196593, + "sampling/importance_sampling_ratio/max": 1.4245776176452636, + "sampling/importance_sampling_ratio/mean": 0.9870476484298706, + "sampling/importance_sampling_ratio/min": 0.5857455730438232, + "sampling/sampling_logp_difference/max": 0.5550769805908203, + "sampling/sampling_logp_difference/mean": 0.016982442513108254, + "step": 560, + "step_time": 4.34924570279909 + }, + { + "clip_ratio/high_max": 0.002222222276031971, + "clip_ratio/high_mean": 0.0011111111380159855, + "clip_ratio/low_mean": 0.0022991543635725977, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003410265501588583, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.8, + "completions/max_terminated_length": 1266.8, + "completions/mean_length": 1015.125, + "completions/mean_terminated_length": 1015.125, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "entropy": 0.17396872639656066, + "epoch": 0.0452, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.0472419410943985, + "kl": 1.4365731865167617, + "learning_rate": 9.946969369344737e-06, + "loss": 0.00030971074011176825, + "num_tokens": 9935273.0, + "reward": 1.1539999842643738, + "reward_std": 0.28637823164463044, + "rewards/env_game_reward/mean": 1.1539999842643738, + "rewards/env_game_reward/std": 0.3490210175514221, + "sampling/importance_sampling_ratio/max": 1.4732376098632813, + "sampling/importance_sampling_ratio/mean": 0.9988004565238953, + "sampling/importance_sampling_ratio/min": 0.72213294506073, + "sampling/sampling_logp_difference/max": 0.4380834102630615, + "sampling/sampling_logp_difference/mean": 0.011317184008657933, + "step": 565, + "step_time": 4.254104064800049 + }, + { + "clip_ratio/high_max": 0.002222222276031971, + "clip_ratio/high_mean": 0.0011111111380159855, + "clip_ratio/low_mean": 0.0034126984886825086, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004523809626698494, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.8, + "completions/max_terminated_length": 1366.8, + "completions/mean_length": 1055.375, + "completions/mean_terminated_length": 1055.375, + "completions/min_length": 764.0, + "completions/min_terminated_length": 764.0, + "entropy": 0.16300990134477616, + "epoch": 0.0456, + "frac_reward_zero_std": 0.325, + "grad_norm": 0.05742908641695976, + "kl": 1.029376059770584, + "learning_rate": 9.946899664969714e-06, + "loss": 0.0016465794295072555, + "num_tokens": 10062425.0, + "reward": 1.2022499561309814, + "reward_std": 0.23970919847488403, + "rewards/env_game_reward/mean": 1.2022499561309814, + "rewards/env_game_reward/std": 0.3561276257038116, + "sampling/importance_sampling_ratio/max": 1.472624158859253, + "sampling/importance_sampling_ratio/mean": 1.0107559204101562, + "sampling/importance_sampling_ratio/min": 0.7311721444129944, + "sampling/sampling_logp_difference/max": 0.49366533756256104, + "sampling/sampling_logp_difference/mean": 0.012838244996964931, + "step": 570, + "step_time": 4.771000784600619 + }, + { + "clip_ratio/high_max": 0.002380952425301075, + "clip_ratio/high_mean": 0.0011904762126505376, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011904762126505376, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1361.2, + "completions/max_terminated_length": 1361.2, + "completions/mean_length": 1047.8125, + "completions/mean_terminated_length": 1047.8125, + "completions/min_length": 716.4, + "completions/min_terminated_length": 716.4, + "entropy": 0.17721292078495027, + "epoch": 0.046, + "frac_reward_zero_std": 0.35, + "grad_norm": 0.07721905410289764, + "kl": 1.2694499015808105, + "learning_rate": 9.946829305302469e-06, + "loss": 0.00022875519935041665, + "num_tokens": 10189363.0, + "reward": 1.1711250066757202, + "reward_std": 0.2282187223434448, + "rewards/env_game_reward/mean": 1.1711250066757202, + "rewards/env_game_reward/std": 0.38280072808265686, + "sampling/importance_sampling_ratio/max": 1.3734643936157227, + "sampling/importance_sampling_ratio/mean": 0.9969516515731811, + "sampling/importance_sampling_ratio/min": 0.5975641906261444, + "sampling/sampling_logp_difference/max": 0.6332993030548095, + "sampling/sampling_logp_difference/mean": 0.014802565798163414, + "step": 575, + "step_time": 5.061972107600741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1397.4, + "completions/max_terminated_length": 1397.4, + "completions/mean_length": 1023.0375, + "completions/mean_terminated_length": 1023.0375, + "completions/min_length": 725.2, + "completions/min_terminated_length": 725.2, + "entropy": 0.15077468156814575, + "epoch": 0.0464, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.034438736736774445, + "kl": 0.9025042593479157, + "learning_rate": 9.946758290355367e-06, + "loss": 0.0001244666986167431, + "num_tokens": 10313827.0, + "reward": 1.2078750133514404, + "reward_std": 0.283373036980629, + "rewards/env_game_reward/mean": 1.2078750133514404, + "rewards/env_game_reward/std": 0.3918757438659668, + "sampling/importance_sampling_ratio/max": 1.2872359037399292, + "sampling/importance_sampling_ratio/mean": 0.991502559185028, + "sampling/importance_sampling_ratio/min": 0.6531059622764588, + "sampling/sampling_logp_difference/max": 0.43392740488052367, + "sampling/sampling_logp_difference/mean": 0.011175908334553241, + "step": 580, + "step_time": 4.977742930199019 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0022774327546358107, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0022774327546358107, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1395.6, + "completions/max_terminated_length": 1395.6, + "completions/mean_length": 1072.8625, + "completions/mean_terminated_length": 1072.8625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 0.1566304475069046, + "epoch": 0.0468, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.043017007410526276, + "kl": 1.0742876887321473, + "learning_rate": 9.946686620140896e-06, + "loss": -0.0001473414245992899, + "num_tokens": 10443641.0, + "reward": 1.252625036239624, + "reward_std": 0.18862073719501496, + "rewards/env_game_reward/mean": 1.252625036239624, + "rewards/env_game_reward/std": 0.31137059032917025, + "sampling/importance_sampling_ratio/max": 1.440973162651062, + "sampling/importance_sampling_ratio/mean": 0.9900519251823425, + "sampling/importance_sampling_ratio/min": 0.47393553853034975, + "sampling/sampling_logp_difference/max": 0.7416575908660888, + "sampling/sampling_logp_difference/mean": 0.017456328868865965, + "step": 585, + "step_time": 5.251834656600113 + }, + { + "epoch": 0.04712, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1344.6666666666667, + "eval_completions/max_terminated_length": 1344.6666666666667, + "eval_completions/mean_length": 1168.0416666666667, + "eval_completions/mean_terminated_length": 1168.0416666666667, + "eval_completions/min_length": 862.3333333333334, + "eval_completions/min_terminated_length": 862.3333333333334, + "eval_entropy": 0.10806348671515782, + "eval_frac_reward_zero_std": 0.5, + "eval_kl": 0.8868140776952108, + "eval_loss": 0.0005803019157610834, + "eval_num_tokens": 10555310.0, + "eval_reward": 1.3479166825612385, + "eval_reward_std": 0.1431891197959582, + "eval_rewards/env_game_reward/mean": 1.3479166825612385, + "eval_rewards/env_game_reward/std": 0.2515455484390259, + "eval_runtime": 5.5745, + "eval_samples_per_second": 1.794, + "eval_sampling/importance_sampling_ratio/max": 1.4131007194519043, + "eval_sampling/importance_sampling_ratio/mean": 1.034290115038554, + "eval_sampling/importance_sampling_ratio/min": 0.6967911720275879, + "eval_sampling/sampling_logp_difference/max": 0.37542269627253216, + "eval_sampling/sampling_logp_difference/mean": 0.01116576852897803, + "eval_steps_per_second": 0.359, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.004533906187862158, + "clip_ratio/low_min": 0.0022727273404598235, + "clip_ratio/region_mean": 0.004533906187862158, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1399.4, + "completions/max_terminated_length": 1399.4, + "completions/mean_length": 1164.425, + "completions/mean_terminated_length": 1164.425, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "entropy": 0.12822237834334374, + "epoch": 0.0472, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.036244940012693405, + "kl": 1.0616086661815642, + "learning_rate": 9.946614294671653e-06, + "loss": 9.502761531621218e-06, + "num_tokens": 10581987.0, + "reward": 1.3098750352859496, + "reward_std": 0.17058949768543244, + "rewards/env_game_reward/mean": 1.3098750352859496, + "rewards/env_game_reward/std": 0.3004313975572586, + "sampling/importance_sampling_ratio/max": 1.4858418226242065, + "sampling/importance_sampling_ratio/mean": 1.0072325468063354, + "sampling/importance_sampling_ratio/min": 0.5586565196514129, + "sampling/sampling_logp_difference/max": 0.6491478204727172, + "sampling/sampling_logp_difference/mean": 0.015580065548419952, + "step": 590, + "step_time": 5.07703026540039 + }, + { + "clip_ratio/high_max": 0.0024390242993831634, + "clip_ratio/high_mean": 0.0012195121496915817, + "clip_ratio/low_mean": 0.0011904762126505376, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002409988362342119, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1362.0, + "completions/max_terminated_length": 1362.0, + "completions/mean_length": 1053.4625, + "completions/mean_terminated_length": 1053.4625, + "completions/min_length": 767.8, + "completions/min_terminated_length": 767.8, + "entropy": 0.1259395658969879, + "epoch": 0.0476, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.0859086737036705, + "kl": 1.2129336059093476, + "learning_rate": 9.94654131396035e-06, + "loss": -0.00022206176072359085, + "num_tokens": 10709974.0, + "reward": 1.2372500419616699, + "reward_std": 0.2305168092250824, + "rewards/env_game_reward/mean": 1.2372500419616699, + "rewards/env_game_reward/std": 0.31874315440654755, + "sampling/importance_sampling_ratio/max": 1.3722262382507324, + "sampling/importance_sampling_ratio/mean": 0.9773016810417176, + "sampling/importance_sampling_ratio/min": 0.5880869805812836, + "sampling/sampling_logp_difference/max": 0.6137372255325317, + "sampling/sampling_logp_difference/mean": 0.01840695794671774, + "step": 595, + "step_time": 4.8652321020003 + }, + { + "clip_ratio/high_max": 0.00467391312122345, + "clip_ratio/high_mean": 0.002336956560611725, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002336956560611725, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1369.0, + "completions/max_terminated_length": 1369.0, + "completions/mean_length": 1018.6375, + "completions/mean_terminated_length": 1018.6375, + "completions/min_length": 701.6, + "completions/min_terminated_length": 701.6, + "entropy": 0.12487654238939286, + "epoch": 0.048, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.09567244350910187, + "kl": 1.3309521913528441, + "learning_rate": 9.94646767801982e-06, + "loss": -0.0002675119787454605, + "num_tokens": 10833208.0, + "reward": 1.2700000047683715, + "reward_std": 0.2577404260635376, + "rewards/env_game_reward/mean": 1.2700000047683715, + "rewards/env_game_reward/std": 0.311552906036377, + "sampling/importance_sampling_ratio/max": 1.4984299182891845, + "sampling/importance_sampling_ratio/mean": 0.9919448137283325, + "sampling/importance_sampling_ratio/min": 0.5176741182804108, + "sampling/sampling_logp_difference/max": 0.763327705860138, + "sampling/sampling_logp_difference/mean": 0.01984752044081688, + "step": 600, + "step_time": 5.101835844999004 + }, + { + "epoch": 0.048, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1345.6666666666667, + "eval_completions/max_terminated_length": 1345.6666666666667, + "eval_completions/mean_length": 1157.8333333333333, + "eval_completions/mean_terminated_length": 1157.8333333333333, + "eval_completions/min_length": 839.0, + "eval_completions/min_terminated_length": 839.0, + "eval_entropy": 0.08671100437641144, + "eval_frac_reward_zero_std": 0.25, + "eval_kl": 0.9999842445055643, + "eval_loss": 0.000851858698297292, + "eval_num_tokens": 10833208.0, + "eval_reward": 1.3208333651224773, + "eval_reward_std": 0.14613540470600128, + "eval_rewards/env_game_reward/mean": 1.3208333651224773, + "eval_rewards/env_game_reward/std": 0.25870031118392944, + "eval_runtime": 5.5348, + "eval_samples_per_second": 1.807, + "eval_sampling/importance_sampling_ratio/max": 1.4152355988820393, + "eval_sampling/importance_sampling_ratio/mean": 1.0069709221522014, + "eval_sampling/importance_sampling_ratio/min": 0.7177760501702627, + "eval_sampling/sampling_logp_difference/max": 0.5611036419868469, + "eval_sampling/sampling_logp_difference/mean": 0.013044494514664015, + "eval_steps_per_second": 0.361, + "step": 600 + }, + { + "clip_ratio/high_max": 0.002380952425301075, + "clip_ratio/high_mean": 0.0011904762126505376, + "clip_ratio/low_mean": 0.002273901831358671, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0034643780440092088, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.8, + "completions/max_terminated_length": 1366.8, + "completions/mean_length": 1079.5875, + "completions/mean_terminated_length": 1079.5875, + "completions/min_length": 725.6, + "completions/min_terminated_length": 725.6, + "entropy": 0.0867956567555666, + "epoch": 0.0484, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.021401429548859596, + "kl": 0.9383753180503845, + "learning_rate": 9.946393386863004e-06, + "loss": 0.0008476153947412967, + "num_tokens": 10963503.0, + "reward": 1.3545000076293945, + "reward_std": 0.1400071382522583, + "rewards/env_game_reward/mean": 1.3545000076293945, + "rewards/env_game_reward/std": 0.22545891106128693, + "sampling/importance_sampling_ratio/max": 1.553718662261963, + "sampling/importance_sampling_ratio/mean": 1.0068870782852173, + "sampling/importance_sampling_ratio/min": 0.5401573002338409, + "sampling/sampling_logp_difference/max": 0.7244943857192994, + "sampling/sampling_logp_difference/mean": 0.013488991186022758, + "step": 605, + "step_time": 4.886114542601717 + }, + { + "clip_ratio/high_max": 0.002222222276031971, + "clip_ratio/high_mean": 0.0011111111380159855, + "clip_ratio/low_mean": 0.003518666513264179, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.004629777651280165, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.2, + "completions/max_terminated_length": 1359.2, + "completions/mean_length": 1043.95, + "completions/mean_terminated_length": 1043.95, + "completions/min_length": 776.8, + "completions/min_terminated_length": 776.8, + "entropy": 0.0908737700432539, + "epoch": 0.0488, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.017844822257757187, + "kl": 1.0020154118537903, + "learning_rate": 9.946318440502963e-06, + "loss": 0.00019409202504903079, + "num_tokens": 11090764.0, + "reward": 1.2816250324249268, + "reward_std": 0.19922733902931214, + "rewards/env_game_reward/mean": 1.2816250324249268, + "rewards/env_game_reward/std": 0.2757085144519806, + "sampling/importance_sampling_ratio/max": 1.3940518379211426, + "sampling/importance_sampling_ratio/mean": 0.9948145389556885, + "sampling/importance_sampling_ratio/min": 0.520936244726181, + "sampling/sampling_logp_difference/max": 0.7158478498458862, + "sampling/sampling_logp_difference/mean": 0.013564139045774937, + "step": 610, + "step_time": 5.003223898000579 + }, + { + "clip_ratio/high_max": 0.0022727273404598235, + "clip_ratio/high_mean": 0.0011363636702299118, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0011363636702299118, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1397.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 1096.8375, + "completions/mean_terminated_length": 1096.8375, + "completions/min_length": 775.6, + "completions/min_terminated_length": 775.6, + "entropy": 0.06849537193775176, + "epoch": 0.0492, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.3096023499965668, + "kl": 1.5637618243694305, + "learning_rate": 9.946242838952871e-06, + "loss": -0.00011666212230920792, + "num_tokens": 11222478.0, + "reward": 1.3425000190734864, + "reward_std": 0.1347038432955742, + "rewards/env_game_reward/mean": 1.3425000190734864, + "rewards/env_game_reward/std": 0.215749391913414, + "sampling/importance_sampling_ratio/max": 1.6003359794616698, + "sampling/importance_sampling_ratio/mean": 1.0017924785614014, + "sampling/importance_sampling_ratio/min": 0.6305740118026734, + "sampling/sampling_logp_difference/max": 0.5970579147338867, + "sampling/sampling_logp_difference/mean": 0.011274610366672277, + "step": 615, + "step_time": 4.919843460199627 + }, + { + "clip_ratio/high_max": 0.004651162773370743, + "clip_ratio/high_mean": 0.0023255813866853714, + "clip_ratio/low_mean": 0.004850464593619108, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00717604598030448, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1394.2, + "completions/max_terminated_length": 1394.2, + "completions/mean_length": 1084.875, + "completions/mean_terminated_length": 1084.875, + "completions/min_length": 701.6, + "completions/min_terminated_length": 701.6, + "entropy": 0.07974468879401683, + "epoch": 0.0496, + "frac_reward_zero_std": 0.4, + "grad_norm": 0.09348504990339279, + "kl": 1.5357508301734923, + "learning_rate": 9.946166582226018e-06, + "loss": 0.001522757112979889, + "num_tokens": 11354065.0, + "reward": 1.349375033378601, + "reward_std": 0.10235370472073554, + "rewards/env_game_reward/mean": 1.349375033378601, + "rewards/env_game_reward/std": 0.2258108526468277, + "sampling/importance_sampling_ratio/max": 1.5427998304367065, + "sampling/importance_sampling_ratio/mean": 0.9522350907325745, + "sampling/importance_sampling_ratio/min": 0.401968015730381, + "sampling/sampling_logp_difference/max": 1.563115644454956, + "sampling/sampling_logp_difference/mean": 0.021521523036062716, + "step": 620, + "step_time": 5.169165654201788 + }, + { + "clip_ratio/high_max": 0.002222222276031971, + "clip_ratio/high_mean": 0.0011111111380159855, + "clip_ratio/low_mean": 0.0022266204468905926, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.003337731584906578, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1394.0, + "completions/max_terminated_length": 1394.0, + "completions/mean_length": 1078.9875, + "completions/mean_terminated_length": 1078.9875, + "completions/min_length": 741.0, + "completions/min_terminated_length": 741.0, + "entropy": 0.08226897865533829, + "epoch": 0.05, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.03745371475815773, + "kl": 1.0471153795719146, + "learning_rate": 9.946089670335809e-06, + "loss": -4.6807329636067155e-05, + "num_tokens": 11484123.0, + "reward": 1.3264999866485596, + "reward_std": 0.15733125507831575, + "rewards/env_game_reward/mean": 1.3264999866485596, + "rewards/env_game_reward/std": 0.248744997382164, + "sampling/importance_sampling_ratio/max": 1.4238209962844848, + "sampling/importance_sampling_ratio/mean": 0.9809260606765747, + "sampling/importance_sampling_ratio/min": 0.6000564575195313, + "sampling/sampling_logp_difference/max": 0.7538355946540832, + "sampling/sampling_logp_difference/mean": 0.012904992513358592, + "step": 625, + "step_time": 5.083012864999182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0010204081423580646, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010204081423580646, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1500.2, + "completions/max_terminated_length": 1500.2, + "completions/mean_length": 1179.3125, + "completions/mean_terminated_length": 1179.3125, + "completions/min_length": 850.6, + "completions/min_terminated_length": 850.6, + "entropy": 0.12509409189224244, + "epoch": 0.0504, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.04384912550449371, + "kl": 1.6370096325874328, + "learning_rate": 9.946012103295763e-06, + "loss": 0.00025049857795238494, + "num_tokens": 11621524.0, + "reward": 1.265916681289673, + "reward_std": 0.2652829051017761, + "rewards/env_game_reward/mean": 1.265916681289673, + "rewards/env_game_reward/std": 0.38448314666748046, + "sampling/importance_sampling_ratio/max": 1.7073483228683473, + "sampling/importance_sampling_ratio/mean": 0.9963474154472352, + "sampling/importance_sampling_ratio/min": 0.4732287287712097, + "sampling/sampling_logp_difference/max": 0.8960028886795044, + "sampling/sampling_logp_difference/mean": 0.02103390172123909, + "step": 630, + "step_time": 5.693049415801215 + }, + { + "clip_ratio/high_max": 0.004001600667834282, + "clip_ratio/high_mean": 0.002000800333917141, + "clip_ratio/low_mean": 0.0038125600665807726, + "clip_ratio/low_min": 0.00181818176060915, + "clip_ratio/region_mean": 0.0058133604004979135, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1703.2, + "completions/max_terminated_length": 1703.2, + "completions/mean_length": 1235.5875, + "completions/mean_terminated_length": 1235.5875, + "completions/min_length": 869.0, + "completions/min_terminated_length": 869.0, + "entropy": 0.13607805520296096, + "epoch": 0.0508, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.07780463993549347, + "kl": 1.3788925766944886, + "learning_rate": 9.94593388111952e-06, + "loss": 0.0022235548123717306, + "num_tokens": 11763516.0, + "reward": 1.2385416746139526, + "reward_std": 0.3068254292011261, + "rewards/env_game_reward/mean": 1.2385416746139526, + "rewards/env_game_reward/std": 0.3703270435333252, + "sampling/importance_sampling_ratio/max": 1.830536437034607, + "sampling/importance_sampling_ratio/mean": 1.037343180179596, + "sampling/importance_sampling_ratio/min": 0.6153202593326569, + "sampling/sampling_logp_difference/max": 0.6751439929008484, + "sampling/sampling_logp_difference/mean": 0.018885864317417143, + "step": 635, + "step_time": 6.099398655800906 + }, + { + "clip_ratio/high_max": 0.003704974241554737, + "clip_ratio/high_mean": 0.0018524871207773685, + "clip_ratio/low_mean": 0.0010416666977107526, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002894153818488121, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1667.2, + "completions/max_terminated_length": 1667.2, + "completions/mean_length": 1243.275, + "completions/mean_terminated_length": 1243.275, + "completions/min_length": 862.8, + "completions/min_terminated_length": 862.8, + "entropy": 0.14713116511702537, + "epoch": 0.0512, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.06170591339468956, + "kl": 1.3155274748802186, + "learning_rate": 9.945855003820824e-06, + "loss": 0.0015274440869688989, + "num_tokens": 11905864.0, + "reward": 1.2326250314712524, + "reward_std": 0.2401216834783554, + "rewards/env_game_reward/mean": 1.2326250314712524, + "rewards/env_game_reward/std": 0.33104810416698455, + "sampling/importance_sampling_ratio/max": 1.4255517959594726, + "sampling/importance_sampling_ratio/mean": 0.9756847858428955, + "sampling/importance_sampling_ratio/min": 0.591080230474472, + "sampling/sampling_logp_difference/max": 0.5704588890075684, + "sampling/sampling_logp_difference/mean": 0.015524715185165405, + "step": 640, + "step_time": 6.075532784599636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0019615384750068187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019615384750068187, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1701.2, + "completions/max_terminated_length": 1701.2, + "completions/mean_length": 1271.575, + "completions/mean_terminated_length": 1271.575, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "entropy": 0.1372353859245777, + "epoch": 0.0516, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.051711492240428925, + "kl": 1.1370346248149872, + "learning_rate": 9.945775471413545e-06, + "loss": -0.00048790322616696356, + "num_tokens": 12050609.0, + "reward": 1.2797083377838134, + "reward_std": 0.25072828233242034, + "rewards/env_game_reward/mean": 1.2797083377838134, + "rewards/env_game_reward/std": 0.34153226017951965, + "sampling/importance_sampling_ratio/max": 1.3425079822540282, + "sampling/importance_sampling_ratio/mean": 0.9526666402816772, + "sampling/importance_sampling_ratio/min": 0.6094960927963257, + "sampling/sampling_logp_difference/max": 0.49349377155303953, + "sampling/sampling_logp_difference/mean": 0.013823052123188972, + "step": 645, + "step_time": 5.696355382000911 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.001963804382830858, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.001963804382830858, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1709.6, + "completions/max_terminated_length": 1709.6, + "completions/mean_length": 1273.3875, + "completions/mean_terminated_length": 1273.3875, + "completions/min_length": 930.0, + "completions/min_terminated_length": 930.0, + "entropy": 0.1402006097137928, + "epoch": 0.052, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.053286418318748474, + "kl": 0.9155610263347626, + "learning_rate": 9.945695283911661e-06, + "loss": 0.0008199753239750863, + "num_tokens": 12194619.0, + "reward": 1.1976250171661378, + "reward_std": 0.28514082431793214, + "rewards/env_game_reward/mean": 1.1976250171661378, + "rewards/env_game_reward/std": 0.36828628182411194, + "sampling/importance_sampling_ratio/max": 1.6765745401382446, + "sampling/importance_sampling_ratio/mean": 0.9975801944732666, + "sampling/importance_sampling_ratio/min": 0.5081476271152496, + "sampling/sampling_logp_difference/max": 0.6622140645980835, + "sampling/sampling_logp_difference/mean": 0.014845983497798443, + "step": 650, + "step_time": 5.861162030600099 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.002022058889269829, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002022058889269829, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1546.0, + "completions/max_terminated_length": 1546.0, + "completions/mean_length": 1165.9375, + "completions/mean_terminated_length": 1165.9375, + "completions/min_length": 874.4, + "completions/min_terminated_length": 874.4, + "entropy": 0.13178882375359535, + "epoch": 0.0524, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.07173304259777069, + "kl": 1.5115601539611816, + "learning_rate": 9.94561444132927e-06, + "loss": 0.000665221968665719, + "num_tokens": 12329314.0, + "reward": 1.0957500100135804, + "reward_std": 0.3418861448764801, + "rewards/env_game_reward/mean": 1.0957500100135804, + "rewards/env_game_reward/std": 0.41972014904022215, + "sampling/importance_sampling_ratio/max": 1.5416927576065063, + "sampling/importance_sampling_ratio/mean": 1.0306774377822876, + "sampling/importance_sampling_ratio/min": 0.6824665546417237, + "sampling/sampling_logp_difference/max": 0.5923709630966186, + "sampling/sampling_logp_difference/mean": 0.014958329871296883, + "step": 655, + "step_time": 4.971940669400647 + }, + { + "clip_ratio/high_max": 0.005824592150747776, + "clip_ratio/high_mean": 0.003932704217731953, + "clip_ratio/low_mean": 0.001981946639716625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.005914650764316321, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1711.8, + "completions/max_terminated_length": 1711.8, + "completions/mean_length": 1323.8, + "completions/mean_terminated_length": 1323.8, + "completions/min_length": 925.4, + "completions/min_terminated_length": 925.4, + "entropy": 0.11637530401349068, + "epoch": 0.0528, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.04864488169550896, + "kl": 0.9532750248908997, + "learning_rate": 9.945532943680585e-06, + "loss": 0.001364955585449934, + "num_tokens": 12478993.0, + "reward": 1.1640416860580445, + "reward_std": 0.30364345014095306, + "rewards/env_game_reward/mean": 1.1640416860580445, + "rewards/env_game_reward/std": 0.3607664942741394, + "sampling/importance_sampling_ratio/max": 1.532017135620117, + "sampling/importance_sampling_ratio/mean": 0.9525379180908203, + "sampling/importance_sampling_ratio/min": 0.4082610189914703, + "sampling/sampling_logp_difference/max": 0.7750426650047302, + "sampling/sampling_logp_difference/mean": 0.01715007275342941, + "step": 660, + "step_time": 5.538303243199334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.004831413738429546, + "clip_ratio/low_min": 0.00181818176060915, + "clip_ratio/region_mean": 0.004831413738429546, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.8, + "completions/max_terminated_length": 1708.8, + "completions/mean_length": 1283.0875, + "completions/mean_terminated_length": 1283.0875, + "completions/min_length": 951.2, + "completions/min_terminated_length": 951.2, + "entropy": 0.08296727910637855, + "epoch": 0.0532, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.0572076216340065, + "kl": 0.8097724616527557, + "learning_rate": 9.945450790979927e-06, + "loss": 0.0013984257355332374, + "num_tokens": 12624480.0, + "reward": 1.2450417280197144, + "reward_std": 0.2561494290828705, + "rewards/env_game_reward/mean": 1.2450417280197144, + "rewards/env_game_reward/std": 0.3560167372226715, + "sampling/importance_sampling_ratio/max": 1.6131016254425048, + "sampling/importance_sampling_ratio/mean": 0.9815467715263366, + "sampling/importance_sampling_ratio/min": 0.6314162969589233, + "sampling/sampling_logp_difference/max": 0.6175156712532044, + "sampling/sampling_logp_difference/mean": 0.012766172736883163, + "step": 665, + "step_time": 5.5065391923984865 + }, + { + "clip_ratio/high_max": 0.003603896126151085, + "clip_ratio/high_mean": 0.0018019480630755424, + "clip_ratio/low_mean": 0.002848330978304148, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00465027904137969, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1633.8, + "completions/max_terminated_length": 1633.8, + "completions/mean_length": 1327.1375, + "completions/mean_terminated_length": 1327.1375, + "completions/min_length": 937.4, + "completions/min_terminated_length": 937.4, + "entropy": 0.11601686254143714, + "epoch": 0.0536, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.0497339628636837, + "kl": 2.4915341794490815, + "learning_rate": 9.94536798324174e-06, + "loss": 0.0018591852858662605, + "num_tokens": 12775518.0, + "reward": 1.0752083778381347, + "reward_std": 0.42891920208930967, + "rewards/env_game_reward/mean": 1.0752083778381347, + "rewards/env_game_reward/std": 0.43512017130851743, + "sampling/importance_sampling_ratio/max": 1.531420397758484, + "sampling/importance_sampling_ratio/mean": 1.0097869038581848, + "sampling/importance_sampling_ratio/min": 0.5700468063354492, + "sampling/sampling_logp_difference/max": 0.4885990142822266, + "sampling/sampling_logp_difference/mean": 0.01330602504312992, + "step": 670, + "step_time": 5.495691229999648 + }, + { + "clip_ratio/high_max": 0.0038838613778352737, + "clip_ratio/high_mean": 0.0019419306889176368, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019419306889176368, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.6, + "completions/max_terminated_length": 1708.6, + "completions/mean_length": 1330.8, + "completions/mean_terminated_length": 1330.8, + "completions/min_length": 972.6, + "completions/min_terminated_length": 972.6, + "entropy": 0.10391684621572495, + "epoch": 0.054, + "frac_reward_zero_std": 0.275, + "grad_norm": 0.060416221618652344, + "kl": 0.8995959877967834, + "learning_rate": 9.945284520480583e-06, + "loss": 0.0008666712790727615, + "num_tokens": 12925994.0, + "reward": 1.2439167022705078, + "reward_std": 0.22910259962081908, + "rewards/env_game_reward/mean": 1.2439167022705078, + "rewards/env_game_reward/std": 0.35838334560394286, + "sampling/importance_sampling_ratio/max": 1.6800979375839233, + "sampling/importance_sampling_ratio/mean": 1.035771083831787, + "sampling/importance_sampling_ratio/min": 0.751843523979187, + "sampling/sampling_logp_difference/max": 0.5335385739803314, + "sampling/sampling_logp_difference/mean": 0.01053644772619009, + "step": 675, + "step_time": 5.700633796999318 + }, + { + "epoch": 0.054, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1647.0, + "eval_completions/max_terminated_length": 1647.0, + "eval_completions/mean_length": 1418.6666666666667, + "eval_completions/mean_terminated_length": 1418.6666666666667, + "eval_completions/min_length": 1066.3333333333333, + "eval_completions/min_terminated_length": 1066.3333333333333, + "eval_entropy": 0.14034629861513773, + "eval_frac_reward_zero_std": 0.0, + "eval_kl": 0.8713982502619425, + "eval_loss": -0.0012209699489176273, + "eval_num_tokens": 12925994.0, + "eval_reward": 1.1308333277702332, + "eval_reward_std": 0.34923218687375385, + "eval_rewards/env_game_reward/mean": 1.1308333277702332, + "eval_rewards/env_game_reward/std": 0.39376481374104816, + "eval_runtime": 5.821, + "eval_samples_per_second": 1.718, + "eval_sampling/importance_sampling_ratio/max": 1.0657129685084026, + "eval_sampling/importance_sampling_ratio/mean": 0.9151384830474854, + "eval_sampling/importance_sampling_ratio/min": 0.6475860774517059, + "eval_sampling/sampling_logp_difference/max": 0.4063406785329183, + "eval_sampling/sampling_logp_difference/mean": 0.01308465547238787, + "eval_steps_per_second": 0.344, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0019294990226626397, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019294990226626397, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1713.6, + "completions/max_terminated_length": 1713.6, + "completions/mean_length": 1285.875, + "completions/mean_terminated_length": 1285.875, + "completions/min_length": 907.2, + "completions/min_terminated_length": 907.2, + "entropy": 0.1750231884419918, + "epoch": 0.0544, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.10211564600467682, + "kl": 0.8659535050392151, + "learning_rate": 9.945200402711124e-06, + "loss": 0.0010523485019803048, + "num_tokens": 13072169.0, + "reward": 1.1676250457763673, + "reward_std": 0.29786873459815977, + "rewards/env_game_reward/mean": 1.1676250457763673, + "rewards/env_game_reward/std": 0.4279802978038788, + "sampling/importance_sampling_ratio/max": 1.4572262048721314, + "sampling/importance_sampling_ratio/mean": 1.0047440648078918, + "sampling/importance_sampling_ratio/min": 0.6181734561920166, + "sampling/sampling_logp_difference/max": 0.5347591876983643, + "sampling/sampling_logp_difference/mean": 0.013022671453654766, + "step": 680, + "step_time": 5.70217539860023 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0010416666977107526, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416666977107526, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1707.0, + "completions/max_terminated_length": 1707.0, + "completions/mean_length": 1213.6875, + "completions/mean_terminated_length": 1213.6875, + "completions/min_length": 914.6, + "completions/min_terminated_length": 914.6, + "entropy": 0.16222424656152726, + "epoch": 0.0548, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.046301327645778656, + "kl": 1.311411839723587, + "learning_rate": 9.945115629948152e-06, + "loss": 0.0011385465040802956, + "num_tokens": 13211179.0, + "reward": 1.187375044822693, + "reward_std": 0.2891477644443512, + "rewards/env_game_reward/mean": 1.187375044822693, + "rewards/env_game_reward/std": 0.40909904837608335, + "sampling/importance_sampling_ratio/max": 1.319038200378418, + "sampling/importance_sampling_ratio/mean": 0.9719479799270629, + "sampling/importance_sampling_ratio/min": 0.6795879960060119, + "sampling/sampling_logp_difference/max": 0.41148210763931276, + "sampling/sampling_logp_difference/mean": 0.010176723822951317, + "step": 685, + "step_time": 5.558335913400515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0010416666977107526, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010416666977107526, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1661.6, + "completions/max_terminated_length": 1661.6, + "completions/mean_length": 1282.8875, + "completions/mean_terminated_length": 1282.8875, + "completions/min_length": 927.4, + "completions/min_terminated_length": 927.4, + "entropy": 0.15061740726232528, + "epoch": 0.0552, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.03988010436296463, + "kl": 0.9013077259063721, + "learning_rate": 9.945030202206567e-06, + "loss": 0.0009480739012360573, + "num_tokens": 13357280.0, + "reward": 1.1207083702087401, + "reward_std": 0.29786873757839205, + "rewards/env_game_reward/mean": 1.1207083702087401, + "rewards/env_game_reward/std": 0.4572018027305603, + "sampling/importance_sampling_ratio/max": 1.295949673652649, + "sampling/importance_sampling_ratio/mean": 1.021243405342102, + "sampling/importance_sampling_ratio/min": 0.7810358166694641, + "sampling/sampling_logp_difference/max": 0.33255696296691895, + "sampling/sampling_logp_difference/mean": 0.010920869559049607, + "step": 690, + "step_time": 5.616496510598518 + }, + { + "clip_ratio/high_max": 0.0018867924809455872, + "clip_ratio/high_mean": 0.0009433962404727936, + "clip_ratio/low_mean": 0.0018424611538648606, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002785857394337654, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1627.0, + "completions/max_terminated_length": 1627.0, + "completions/mean_length": 1297.0, + "completions/mean_terminated_length": 1297.0, + "completions/min_length": 929.6, + "completions/min_terminated_length": 929.6, + "entropy": 0.17935092002153397, + "epoch": 0.0556, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.04438179358839989, + "kl": 1.0830827236175538, + "learning_rate": 9.944944119501391e-06, + "loss": 0.00012169899418950081, + "num_tokens": 13503504.0, + "reward": 1.102041745185852, + "reward_std": 0.4174876242876053, + "rewards/env_game_reward/mean": 1.102041745185852, + "rewards/env_game_reward/std": 0.4797330558300018, + "sampling/importance_sampling_ratio/max": 1.2585298061370849, + "sampling/importance_sampling_ratio/mean": 0.991274642944336, + "sampling/importance_sampling_ratio/min": 0.684896045923233, + "sampling/sampling_logp_difference/max": 0.49414026737213135, + "sampling/sampling_logp_difference/mean": 0.011651785112917423, + "step": 695, + "step_time": 5.530571238000266 + }, + { + "clip_ratio/high_max": 0.0019999999552965165, + "clip_ratio/high_mean": 0.0009999999776482583, + "clip_ratio/low_mean": 0.0019615384750068187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002961538452655077, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1601.0, + "completions/max_terminated_length": 1601.0, + "completions/mean_length": 1268.6, + "completions/mean_terminated_length": 1268.6, + "completions/min_length": 884.4, + "completions/min_terminated_length": 884.4, + "entropy": 0.17323798462748527, + "epoch": 0.056, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.06453540921211243, + "kl": 1.1303745985031128, + "learning_rate": 9.94485738184775e-06, + "loss": 0.004169812053442001, + "num_tokens": 13648095.0, + "reward": 1.2017917394638062, + "reward_std": 0.3141322135925293, + "rewards/env_game_reward/mean": 1.2017917394638062, + "rewards/env_game_reward/std": 0.4132618486881256, + "sampling/importance_sampling_ratio/max": 1.5371316432952882, + "sampling/importance_sampling_ratio/mean": 0.9977814793586731, + "sampling/importance_sampling_ratio/min": 0.6063427329063416, + "sampling/sampling_logp_difference/max": 0.4800256252288818, + "sampling/sampling_logp_difference/mean": 0.0133250679820776, + "step": 700, + "step_time": 5.324122882598749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0009433962404727936, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009433962404727936, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1708.0, + "completions/max_terminated_length": 1708.0, + "completions/mean_length": 1299.7, + "completions/mean_terminated_length": 1299.7, + "completions/min_length": 908.8, + "completions/min_terminated_length": 908.8, + "entropy": 0.12524148747324942, + "epoch": 0.0564, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.05619892105460167, + "kl": 1.2845276713371276, + "learning_rate": 9.944769989260896e-06, + "loss": 2.784114331007004e-05, + "num_tokens": 13796057.0, + "reward": 1.2679583549499511, + "reward_std": 0.28408015966415406, + "rewards/env_game_reward/mean": 1.2679583549499511, + "rewards/env_game_reward/std": 0.3855504870414734, + "sampling/importance_sampling_ratio/max": 1.3967360258102417, + "sampling/importance_sampling_ratio/mean": 1.0201675534248351, + "sampling/importance_sampling_ratio/min": 0.6278232634067535, + "sampling/sampling_logp_difference/max": 0.47589802742004395, + "sampling/sampling_logp_difference/mean": 0.010255707520991563, + "step": 705, + "step_time": 5.642836955800158 + }, + { + "clip_ratio/high_max": 0.002083333395421505, + "clip_ratio/high_mean": 0.0010416666977107526, + "clip_ratio/low_mean": 0.0018894830718636513, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002931149769574404, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1591.0, + "completions/max_terminated_length": 1591.0, + "completions/mean_length": 1210.55, + "completions/mean_terminated_length": 1210.55, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "entropy": 0.11609519273042679, + "epoch": 0.0568, + "frac_reward_zero_std": 0.025, + "grad_norm": 0.07330711930990219, + "kl": 1.102458918094635, + "learning_rate": 9.944681941756187e-06, + "loss": -0.00021149502135813235, + "num_tokens": 13934913.0, + "reward": 1.2933750391006469, + "reward_std": 0.2579171985387802, + "rewards/env_game_reward/mean": 1.2933750391006469, + "rewards/env_game_reward/std": 0.3375827193260193, + "sampling/importance_sampling_ratio/max": 1.4367228507995606, + "sampling/importance_sampling_ratio/mean": 0.9958265781402588, + "sampling/importance_sampling_ratio/min": 0.5978623509407044, + "sampling/sampling_logp_difference/max": 0.5757256031036377, + "sampling/sampling_logp_difference/mean": 0.011967730149626732, + "step": 710, + "step_time": 5.246779600399895 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1703.6, + "completions/max_terminated_length": 1703.6, + "completions/mean_length": 1318.8625, + "completions/mean_terminated_length": 1318.8625, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "entropy": 0.09110566601157188, + "epoch": 0.0572, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.03678731247782707, + "kl": 1.1454283237457275, + "learning_rate": 9.944593239349106e-06, + "loss": -0.00031706057488918306, + "num_tokens": 14084958.0, + "reward": 1.3445000171661377, + "reward_std": 0.2526728391647339, + "rewards/env_game_reward/mean": 1.3445000171661377, + "rewards/env_game_reward/std": 0.33565597534179686, + "sampling/importance_sampling_ratio/max": 1.3654388189315796, + "sampling/importance_sampling_ratio/mean": 1.0032867074012757, + "sampling/importance_sampling_ratio/min": 0.7344457149505615, + "sampling/sampling_logp_difference/max": 0.3897515535354614, + "sampling/sampling_logp_difference/mean": 0.0067275471054017546, + "step": 715, + "step_time": 5.783622994599137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0019615384750068187, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0019615384750068187, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1663.6, + "completions/max_terminated_length": 1663.6, + "completions/mean_length": 1257.9125, + "completions/mean_terminated_length": 1257.9125, + "completions/min_length": 884.6, + "completions/min_terminated_length": 884.6, + "entropy": 0.09665783047676087, + "epoch": 0.0576, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.011852607131004333, + "kl": 1.0820326685905457, + "learning_rate": 9.944503882055243e-06, + "loss": 0.0012081136927008628, + "num_tokens": 14228074.0, + "reward": 1.3299166679382324, + "reward_std": 0.2917994186282158, + "rewards/env_game_reward/mean": 1.3299166679382324, + "rewards/env_game_reward/std": 0.33310834765434266, + "sampling/importance_sampling_ratio/max": 1.1939823865890502, + "sampling/importance_sampling_ratio/mean": 1.0095659494400024, + "sampling/importance_sampling_ratio/min": 0.8032400846481323, + "sampling/sampling_logp_difference/max": 0.28812804222106936, + "sampling/sampling_logp_difference/mean": 0.006528158858418465, + "step": 720, + "step_time": 5.56530989899984 + }, + { + "clip_ratio/high_max": 0.00181818176060915, + "clip_ratio/high_mean": 0.0014904862269759177, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0014904862269759177, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1624.6, + "completions/max_terminated_length": 1624.6, + "completions/mean_length": 1252.5625, + "completions/mean_terminated_length": 1252.5625, + "completions/min_length": 871.8, + "completions/min_terminated_length": 871.8, + "entropy": 0.11173812597990036, + "epoch": 0.058, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.06697779148817062, + "kl": 1.3808134913444519, + "learning_rate": 9.944413869890304e-06, + "loss": -0.0006437717471271754, + "num_tokens": 14371761.0, + "reward": 1.2972917079925537, + "reward_std": 0.228572279214859, + "rewards/env_game_reward/mean": 1.2972917079925537, + "rewards/env_game_reward/std": 0.29336669147014616, + "sampling/importance_sampling_ratio/max": 1.120638871192932, + "sampling/importance_sampling_ratio/mean": 0.9524656891822815, + "sampling/importance_sampling_ratio/min": 0.5668362379074097, + "sampling/sampling_logp_difference/max": 0.6708258748054504, + "sampling/sampling_logp_difference/mean": 0.00866871876642108, + "step": 725, + "step_time": 5.52822365559914 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0029842125251889227, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0029842125251889227, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1628.2, + "completions/max_terminated_length": 1628.2, + "completions/mean_length": 1275.55, + "completions/mean_terminated_length": 1275.55, + "completions/min_length": 883.2, + "completions/min_terminated_length": 883.2, + "entropy": 0.1060483768582344, + "epoch": 0.0584, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022124866023659706, + "kl": 1.2471987664699555, + "learning_rate": 9.944323202870115e-06, + "loss": 0.0006659584119915962, + "num_tokens": 14517088.0, + "reward": 1.312000036239624, + "reward_std": 0.22073518633842468, + "rewards/env_game_reward/mean": 1.312000036239624, + "rewards/env_game_reward/std": 0.3286490172147751, + "sampling/importance_sampling_ratio/max": 1.438795232772827, + "sampling/importance_sampling_ratio/mean": 0.9970067143440247, + "sampling/importance_sampling_ratio/min": 0.6118991255760193, + "sampling/sampling_logp_difference/max": 0.4648634433746338, + "sampling/sampling_logp_difference/mean": 0.009209131356328726, + "step": 730, + "step_time": 5.445555883600173 + }, + { + "clip_ratio/high_max": 0.001960784383118153, + "clip_ratio/high_mean": 0.0009803921915590764, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009803921915590764, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1680.2, + "completions/max_terminated_length": 1680.2, + "completions/mean_length": 1208.5, + "completions/mean_terminated_length": 1208.5, + "completions/min_length": 853.2, + "completions/min_terminated_length": 853.2, + "entropy": 0.10054874122142791, + "epoch": 0.0588, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.052741412073373795, + "kl": 1.1816912770271302, + "learning_rate": 9.944231881010614e-06, + "loss": 0.0007746644783765078, + "num_tokens": 14657077.0, + "reward": 1.2604999780654906, + "reward_std": 0.22438856363296508, + "rewards/env_game_reward/mean": 1.2604999780654906, + "rewards/env_game_reward/std": 0.2969023913145065, + "sampling/importance_sampling_ratio/max": 1.2568823099136353, + "sampling/importance_sampling_ratio/mean": 0.9825718402862549, + "sampling/importance_sampling_ratio/min": 0.7182797014713287, + "sampling/sampling_logp_difference/max": 0.42328827977180483, + "sampling/sampling_logp_difference/mean": 0.009410615637898445, + "step": 735, + "step_time": 5.590113839400146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0027369407005608084, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0027369407005608084, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1674.6, + "completions/max_terminated_length": 1674.6, + "completions/mean_length": 1325.8, + "completions/mean_terminated_length": 1325.8, + "completions/min_length": 881.8, + "completions/min_terminated_length": 881.8, + "entropy": 0.08453329205513001, + "epoch": 0.0592, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.05587141960859299, + "kl": 1.1068053007125855, + "learning_rate": 9.944139904327855e-06, + "loss": 0.001067483052611351, + "num_tokens": 14807478.0, + "reward": 1.3697916984558105, + "reward_std": 0.21584435254335405, + "rewards/env_game_reward/mean": 1.3697916984558105, + "rewards/env_game_reward/std": 0.29844184815883634, + "sampling/importance_sampling_ratio/max": 1.4578536033630372, + "sampling/importance_sampling_ratio/mean": 1.0180188417434692, + "sampling/importance_sampling_ratio/min": 0.7393244504928589, + "sampling/sampling_logp_difference/max": 0.47165329456329347, + "sampling/sampling_logp_difference/mean": 0.007859865296632051, + "step": 740, + "step_time": 5.500019186200371 + }, + { + "clip_ratio/high_max": 0.0019999999552965165, + "clip_ratio/high_mean": 0.0009999999776482583, + "clip_ratio/low_mean": 0.0015050167683511972, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002505016792565584, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1540.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 1221.325, + "completions/mean_terminated_length": 1221.325, + "completions/min_length": 889.2, + "completions/min_terminated_length": 889.2, + "entropy": 0.17823160253465176, + "epoch": 0.0596, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.2046627402305603, + "kl": 2.1721172749996187, + "learning_rate": 9.944047272838003e-06, + "loss": 0.005118013173341751, + "num_tokens": 14947959.0, + "reward": 1.328333306312561, + "reward_std": 0.23499515950679778, + "rewards/env_game_reward/mean": 1.328333306312561, + "rewards/env_game_reward/std": 0.3135849416255951, + "sampling/importance_sampling_ratio/max": 1.5344476461410523, + "sampling/importance_sampling_ratio/mean": 1.0067410469055176, + "sampling/importance_sampling_ratio/min": 0.5616122364997982, + "sampling/sampling_logp_difference/max": 5.125636863708496, + "sampling/sampling_logp_difference/mean": 0.022377347387373448, + "step": 745, + "step_time": 5.2221048694002095 + }, + { + "clip_ratio/high_max": 0.004959130194038153, + "clip_ratio/high_mean": 0.0024795650970190763, + "clip_ratio/low_mean": 0.0043784501031041145, + "clip_ratio/low_min": 0.0013333333656191826, + "clip_ratio/region_mean": 0.006858015200123191, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1596.4, + "completions/max_terminated_length": 1596.4, + "completions/mean_length": 1219.2125, + "completions/mean_terminated_length": 1219.2125, + "completions/min_length": 906.8, + "completions/min_terminated_length": 906.8, + "entropy": 0.19808728545904158, + "epoch": 0.06, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028860095888376236, + "kl": 1.1904378652572631, + "learning_rate": 9.943953986557342e-06, + "loss": 0.0009089265018701554, + "num_tokens": 15088378.0, + "reward": 1.3279583692550658, + "reward_std": 0.21242666840553284, + "rewards/env_game_reward/mean": 1.3279583692550658, + "rewards/env_game_reward/std": 0.3157786726951599, + "sampling/importance_sampling_ratio/max": 1.2873046875, + "sampling/importance_sampling_ratio/mean": 0.9388688564300537, + "sampling/importance_sampling_ratio/min": 0.37807847261428834, + "sampling/sampling_logp_difference/max": 0.7840029001235962, + "sampling/sampling_logp_difference/mean": 0.016058788914233447, + "step": 750, + "step_time": 5.506602738799847 + }, + { + "epoch": 0.06, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1642.3333333333333, + "eval_completions/max_terminated_length": 1642.3333333333333, + "eval_completions/mean_length": 1389.2916666666667, + "eval_completions/mean_terminated_length": 1389.2916666666667, + "eval_completions/min_length": 1053.0, + "eval_completions/min_terminated_length": 1053.0, + "eval_entropy": 0.16020495692888895, + "eval_frac_reward_zero_std": 0.16666666666666666, + "eval_kl": 1.373230218887329, + "eval_loss": -0.0009800799889490008, + "eval_num_tokens": 15088378.0, + "eval_reward": 1.1752777894337971, + "eval_reward_std": 0.38537317017714184, + "eval_rewards/env_game_reward/mean": 1.1752777894337971, + "eval_rewards/env_game_reward/std": 0.42335541049639386, + "eval_runtime": 5.7638, + "eval_samples_per_second": 1.735, + "eval_sampling/importance_sampling_ratio/max": 1.4313355684280396, + "eval_sampling/importance_sampling_ratio/mean": 1.0740643541018169, + "eval_sampling/importance_sampling_ratio/min": 0.8583299318949381, + "eval_sampling/sampling_logp_difference/max": 0.40136027336120605, + "eval_sampling/sampling_logp_difference/mean": 0.011960081600894531, + "eval_steps_per_second": 0.347, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0028861789032816887, + "clip_ratio/high_mean": 0.002068089507520199, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002068089507520199, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1881.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1388.0875, + "completions/mean_terminated_length": 1388.0875, + "completions/min_length": 982.2, + "completions/min_terminated_length": 982.2, + "entropy": 0.2721471354365349, + "epoch": 0.0604, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.13366009294986725, + "kl": 1.4383060097694398, + "learning_rate": 9.943860045502275e-06, + "loss": 0.0013721118681132793, + "num_tokens": 15242399.0, + "reward": 1.1892500162124633, + "reward_std": 0.40534055829048155, + "rewards/env_game_reward/mean": 1.1892500162124633, + "rewards/env_game_reward/std": 0.4778396964073181, + "sampling/importance_sampling_ratio/max": 1.5637898206710816, + "sampling/importance_sampling_ratio/mean": 1.0453999996185304, + "sampling/importance_sampling_ratio/min": 0.7041348934173584, + "sampling/sampling_logp_difference/max": 0.39543609619140624, + "sampling/sampling_logp_difference/mean": 0.016556292213499545, + "step": 755, + "step_time": 6.289189056601754 + }, + { + "clip_ratio/high_max": 0.001666666753590107, + "clip_ratio/high_mean": 0.0008333333767950535, + "clip_ratio/low_mean": 0.0016708438284695148, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002504177112132311, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1989.6, + "completions/max_terminated_length": 1989.6, + "completions/mean_length": 1483.5125, + "completions/mean_terminated_length": 1483.5125, + "completions/min_length": 948.0, + "completions/min_terminated_length": 948.0, + "entropy": 0.22058189362287522, + "epoch": 0.0608, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.055043403059244156, + "kl": 1.480393397808075, + "learning_rate": 9.943765449689311e-06, + "loss": 0.0006418362259864807, + "num_tokens": 15404568.0, + "reward": 1.2230416774749755, + "reward_std": 0.26340569257736207, + "rewards/env_game_reward/mean": 1.2230416774749755, + "rewards/env_game_reward/std": 0.3767798840999603, + "sampling/importance_sampling_ratio/max": 1.6279995679855346, + "sampling/importance_sampling_ratio/mean": 1.014073097705841, + "sampling/importance_sampling_ratio/min": 0.5342595517635346, + "sampling/sampling_logp_difference/max": 0.5853055238723754, + "sampling/sampling_logp_difference/mean": 0.015527874231338501, + "step": 760, + "step_time": 6.58562049539978 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0008620689623057842, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0008620689623057842, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1925.0, + "completions/max_terminated_length": 1925.0, + "completions/mean_length": 1438.575, + "completions/mean_terminated_length": 1438.575, + "completions/min_length": 978.4, + "completions/min_terminated_length": 978.4, + "entropy": 0.1898469567298889, + "epoch": 0.0612, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.04669173061847687, + "kl": 1.7866320490837098, + "learning_rate": 9.94367019913508e-06, + "loss": -0.0005762927234172821, + "num_tokens": 15563247.0, + "reward": 1.3553274154663086, + "reward_std": 0.27411331832408903, + "rewards/env_game_reward/mean": 1.3553274154663086, + "rewards/env_game_reward/std": 0.3332707077264786, + "sampling/importance_sampling_ratio/max": 1.5550270080566406, + "sampling/importance_sampling_ratio/mean": 1.0086857557296753, + "sampling/importance_sampling_ratio/min": 0.6248692035675049, + "sampling/sampling_logp_difference/max": 0.5656398415565491, + "sampling/sampling_logp_difference/mean": 0.016817177832126617, + "step": 765, + "step_time": 6.523562220599706 + }, + { + "clip_ratio/high_max": 0.0034807992400601507, + "clip_ratio/high_mean": 0.0017403996200300754, + "clip_ratio/low_mean": 0.0008576392603572458, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0025980389036703855, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2061.0, + "completions/max_terminated_length": 2061.0, + "completions/mean_length": 1509.6, + "completions/mean_terminated_length": 1509.6, + "completions/min_length": 1017.8, + "completions/min_terminated_length": 1017.8, + "entropy": 0.11955822706222534, + "epoch": 0.0616, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.10209216922521591, + "kl": 1.0451581090688706, + "learning_rate": 9.943574293856327e-06, + "loss": 0.01794065982103348, + "num_tokens": 15726895.0, + "reward": 1.4529226541519165, + "reward_std": 0.21203944683074952, + "rewards/env_game_reward/mean": 1.4529226541519165, + "rewards/env_game_reward/std": 0.31403881311416626, + "sampling/importance_sampling_ratio/max": 1.901965069770813, + "sampling/importance_sampling_ratio/mean": 0.8959596395492554, + "sampling/importance_sampling_ratio/min": 0.21005995989909126, + "sampling/sampling_logp_difference/max": 4.415722644329071, + "sampling/sampling_logp_difference/mean": 0.022534814849495887, + "step": 770, + "step_time": 8.61529116679958 + }, + { + "clip_ratio/high_max": 0.001954938913695514, + "clip_ratio/high_mean": 0.000977469456847757, + "clip_ratio/low_mean": 0.0013538596511352807, + "clip_ratio/low_min": 0.0003663003677502275, + "clip_ratio/region_mean": 0.0023313291429076346, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1926.0, + "completions/max_terminated_length": 1923.6, + "completions/mean_length": 1459.275, + "completions/mean_terminated_length": 1450.872509765625, + "completions/min_length": 964.2, + "completions/min_terminated_length": 964.2, + "entropy": 0.12240489572286606, + "epoch": 0.062, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.10321921855211258, + "kl": 0.7054299473762512, + "learning_rate": 9.943477733869912e-06, + "loss": 0.037728136777877806, + "num_tokens": 15885232.0, + "reward": 1.430791687965393, + "reward_std": 0.20286390632390977, + "rewards/env_game_reward/mean": 1.430791687965393, + "rewards/env_game_reward/std": 0.3058221280574799, + "sampling/importance_sampling_ratio/max": 1.7364504098892213, + "sampling/importance_sampling_ratio/mean": 0.7705297708511353, + "sampling/importance_sampling_ratio/min": 0.07456759945838565, + "sampling/sampling_logp_difference/max": 4.6616837739944454, + "sampling/sampling_logp_difference/mean": 0.01805206313729286, + "step": 775, + "step_time": 9.152791061800963 + }, + { + "clip_ratio/high_max": 0.0029415366472676395, + "clip_ratio/high_mean": 0.001613421703223139, + "clip_ratio/low_mean": 0.002161662018625066, + "clip_ratio/low_min": 0.00038369541289284825, + "clip_ratio/region_mean": 0.0037750837625935675, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 2117.8, + "completions/max_terminated_length": 2057.8, + "completions/mean_length": 1638.35, + "completions/mean_terminated_length": 1615.8751708984375, + "completions/min_length": 1121.2, + "completions/min_terminated_length": 1121.2, + "entropy": 0.14629979468882084, + "epoch": 0.0624, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.10632387548685074, + "kl": 1.703850120306015, + "learning_rate": 9.943380519192805e-06, + "loss": 0.05711690783500671, + "num_tokens": 16060923.0, + "reward": 1.5017619371414184, + "reward_std": 0.15968829095363618, + "rewards/env_game_reward/mean": 1.5017619371414184, + "rewards/env_game_reward/std": 0.2627015709877014, + "sampling/importance_sampling_ratio/max": 1.9005746364593505, + "sampling/importance_sampling_ratio/mean": 0.852447235584259, + "sampling/importance_sampling_ratio/min": 0.006003885331677944, + "sampling/sampling_logp_difference/max": 10.413811373710633, + "sampling/sampling_logp_difference/mean": 0.0183597469702363, + "step": 780, + "step_time": 10.805634913398535 + }, + { + "clip_ratio/high_max": 0.002865154389292002, + "clip_ratio/high_mean": 0.0015746226534247398, + "clip_ratio/low_mean": 0.000639259337913245, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002213882014621049, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2010.4, + "completions/max_terminated_length": 1945.2, + "completions/mean_length": 1429.275, + "completions/mean_terminated_length": 1417.6500244140625, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 0.13611191138625145, + "epoch": 0.0628, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.08338388800621033, + "kl": 0.7149348020553589, + "learning_rate": 9.943282649842098e-06, + "loss": -0.01947730779647827, + "num_tokens": 16217048.0, + "reward": 1.4835833787918091, + "reward_std": 0.1830228090286255, + "rewards/env_game_reward/mean": 1.4835833787918091, + "rewards/env_game_reward/std": 0.2726488709449768, + "sampling/importance_sampling_ratio/max": 1.8740410327911377, + "sampling/importance_sampling_ratio/mean": 0.9315284371376038, + "sampling/importance_sampling_ratio/min": 0.29684333456025164, + "sampling/sampling_logp_difference/max": 4.141123366355896, + "sampling/sampling_logp_difference/mean": 0.009943006094545126, + "step": 785, + "step_time": 8.90153669700012 + }, + { + "clip_ratio/high_max": 0.0005181347019970417, + "clip_ratio/high_mean": 0.00025906735099852086, + "clip_ratio/low_mean": 0.0014250828709919006, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0016841501754242926, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2021.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1568.0, + "completions/mean_terminated_length": 1568.0, + "completions/min_length": 1089.6, + "completions/min_terminated_length": 1089.6, + "entropy": 0.1328461952507496, + "epoch": 0.0632, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.07064507901668549, + "kl": 0.66820108294487, + "learning_rate": 9.943184125834994e-06, + "loss": 0.03775088787078858, + "num_tokens": 16386132.0, + "reward": 1.5388035774230957, + "reward_std": 0.13462809175252916, + "rewards/env_game_reward/mean": 1.5388035774230957, + "rewards/env_game_reward/std": 0.2006644695997238, + "sampling/importance_sampling_ratio/max": 1.3795575380325318, + "sampling/importance_sampling_ratio/mean": 0.930507481098175, + "sampling/importance_sampling_ratio/min": 0.3511299118457078, + "sampling/sampling_logp_difference/max": 6.95696439743042, + "sampling/sampling_logp_difference/mean": 0.013957394100725651, + "step": 790, + "step_time": 8.834509170798992 + }, + { + "clip_ratio/high_max": 0.002479099528864026, + "clip_ratio/high_mean": 0.001239549764432013, + "clip_ratio/low_mean": 0.0016097922343760729, + "clip_ratio/low_min": 0.0006060605868697166, + "clip_ratio/region_mean": 0.002849341952241957, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 1934.6, + "completions/max_terminated_length": 1934.6, + "completions/mean_length": 1502.5125, + "completions/mean_terminated_length": 1505.343359375, + "completions/min_length": 978.2, + "completions/min_terminated_length": 978.2, + "entropy": 0.16015452295541763, + "epoch": 0.0636, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.14999434351921082, + "kl": 1.1131888061761857, + "learning_rate": 9.943084947188814e-06, + "loss": -0.012555241584777832, + "num_tokens": 16549995.0, + "reward": 1.5487381219863892, + "reward_std": 0.12269987612962723, + "rewards/env_game_reward/mean": 1.5487381219863892, + "rewards/env_game_reward/std": 0.21309578120708467, + "sampling/importance_sampling_ratio/max": 1.5328397512435914, + "sampling/importance_sampling_ratio/mean": 0.9361981153488159, + "sampling/importance_sampling_ratio/min": 0.22185008656667407, + "sampling/sampling_logp_difference/max": 7.313794112205505, + "sampling/sampling_logp_difference/mean": 0.02059700442478061, + "step": 795, + "step_time": 7.839289776599617 + }, + { + "clip_ratio/high_max": 0.0006250000093132258, + "clip_ratio/high_mean": 0.0003125000046566129, + "clip_ratio/low_mean": 0.0009224468318279833, + "clip_ratio/low_min": 0.0004545454401522875, + "clip_ratio/region_mean": 0.0012349468364845962, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1974.4, + "completions/max_terminated_length": 1974.4, + "completions/mean_length": 1553.1875, + "completions/mean_terminated_length": 1553.1875, + "completions/min_length": 1044.8, + "completions/min_terminated_length": 1044.8, + "entropy": 0.17186126336455346, + "epoch": 0.064, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.11871084570884705, + "kl": 1.1377224385738374, + "learning_rate": 9.942985113920988e-06, + "loss": 0.029259467124938966, + "num_tokens": 16717754.0, + "reward": 1.536363124847412, + "reward_std": 0.1405543178319931, + "rewards/env_game_reward/mean": 1.536363124847412, + "rewards/env_game_reward/std": 0.16022475957870483, + "sampling/importance_sampling_ratio/max": 1.7989203214645386, + "sampling/importance_sampling_ratio/mean": 1.0040669798851014, + "sampling/importance_sampling_ratio/min": 0.494415277243539, + "sampling/sampling_logp_difference/max": 5.424048852920532, + "sampling/sampling_logp_difference/mean": 0.013903583586215972, + "step": 800, + "step_time": 8.159463358601352 + }, + { + "clip_ratio/high_max": 0.0017976711736992002, + "clip_ratio/high_mean": 0.0008988355868496001, + "clip_ratio/low_mean": 0.0016219173092395067, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002520752896089107, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2027.0, + "completions/max_terminated_length": 2027.0, + "completions/mean_length": 1557.8875, + "completions/mean_terminated_length": 1553.0508544921875, + "completions/min_length": 1052.6, + "completions/min_terminated_length": 1052.6, + "entropy": 0.17057686448097228, + "epoch": 0.0644, + "frac_reward_zero_std": 0.3, + "grad_norm": 0.10110954195261002, + "kl": 0.8300488352775574, + "learning_rate": 9.942884626049074e-06, + "loss": 0.07206388711929321, + "num_tokens": 16886207.0, + "reward": 1.585892915725708, + "reward_std": 0.07475129663944244, + "rewards/env_game_reward/mean": 1.585892915725708, + "rewards/env_game_reward/std": 0.10948928892612457, + "sampling/importance_sampling_ratio/max": 1.4725745916366577, + "sampling/importance_sampling_ratio/mean": 0.9733787894248962, + "sampling/importance_sampling_ratio/min": 0.46375017166137694, + "sampling/sampling_logp_difference/max": 0.5177584171295166, + "sampling/sampling_logp_difference/mean": 0.008902581129223108, + "step": 805, + "step_time": 9.054756587599694 + }, + { + "clip_ratio/high_max": 0.004471949115395546, + "clip_ratio/high_mean": 0.002951120538637042, + "clip_ratio/low_mean": 0.0007151460275053978, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00366626656614244, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1901.2, + "completions/max_terminated_length": 1901.2, + "completions/mean_length": 1397.4125, + "completions/mean_terminated_length": 1397.4125, + "completions/min_length": 1015.4, + "completions/min_terminated_length": 1015.4, + "entropy": 0.43061949610710143, + "epoch": 0.0648, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.08036839962005615, + "kl": 2.31134774684906, + "learning_rate": 9.942783483590728e-06, + "loss": 0.010299382358789444, + "num_tokens": 17040061.0, + "reward": 1.5596429109573364, + "reward_std": 0.08182235956192016, + "rewards/env_game_reward/mean": 1.5596429109573364, + "rewards/env_game_reward/std": 0.13593876510858535, + "sampling/importance_sampling_ratio/max": 1.5340835809707642, + "sampling/importance_sampling_ratio/mean": 0.9579884886741639, + "sampling/importance_sampling_ratio/min": 0.2988631462066783, + "sampling/sampling_logp_difference/max": 6.111863076686859, + "sampling/sampling_logp_difference/mean": 0.035615741088986394, + "step": 810, + "step_time": 6.779588287798106 + }, + { + "clip_ratio/high_max": 0.002151205716654658, + "clip_ratio/high_mean": 0.001408936199732125, + "clip_ratio/low_mean": 0.0037830369081348183, + "clip_ratio/low_min": 0.001368796918541193, + "clip_ratio/region_mean": 0.005191973014734686, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1977.0, + "completions/max_terminated_length": 1977.0, + "completions/mean_length": 1473.0375, + "completions/mean_terminated_length": 1473.0375, + "completions/min_length": 974.4, + "completions/min_terminated_length": 974.4, + "entropy": 0.40927894711494445, + "epoch": 0.0652, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.07628747820854187, + "kl": 2.2936408281326295, + "learning_rate": 9.942681686563735e-06, + "loss": 0.008698154985904694, + "num_tokens": 17200509.0, + "reward": 1.5981250286102295, + "reward_std": 0.05533111207187176, + "rewards/env_game_reward/mean": 1.5981250286102295, + "rewards/env_game_reward/std": 0.08176134005188943, + "sampling/importance_sampling_ratio/max": 1.5122954607009889, + "sampling/importance_sampling_ratio/mean": 0.9499858260154724, + "sampling/importance_sampling_ratio/min": 0.2841934680938721, + "sampling/sampling_logp_difference/max": 9.145073175430298, + "sampling/sampling_logp_difference/mean": 0.07416130937635898, + "step": 815, + "step_time": 6.956511554600001 + }, + { + "clip_ratio/high_max": 0.0007575757801532746, + "clip_ratio/high_mean": 0.0003787878900766373, + "clip_ratio/low_mean": 0.0013752276543527841, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017540155444294215, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1854.4, + "completions/max_terminated_length": 1854.4, + "completions/mean_length": 1441.1875, + "completions/mean_terminated_length": 1441.1875, + "completions/min_length": 1054.6, + "completions/min_terminated_length": 1054.6, + "entropy": 0.34008778631687164, + "epoch": 0.0656, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.07586944103240967, + "kl": 2.400510585308075, + "learning_rate": 9.942579234985985e-06, + "loss": 0.012332186847925187, + "num_tokens": 17358344.0, + "reward": 1.5910000562667848, + "reward_std": 0.07530688047409058, + "rewards/env_game_reward/mean": 1.5910000562667848, + "rewards/env_game_reward/std": 0.12365454062819481, + "sampling/importance_sampling_ratio/max": 1.4079306602478028, + "sampling/importance_sampling_ratio/mean": 0.9352385640144348, + "sampling/importance_sampling_ratio/min": 0.277198314811837, + "sampling/sampling_logp_difference/max": 11.205174541473388, + "sampling/sampling_logp_difference/mean": 0.046147267892956735, + "step": 820, + "step_time": 6.762566551601049 + }, + { + "clip_ratio/high_max": 0.0006944444496184588, + "clip_ratio/high_mean": 0.0003472222248092294, + "clip_ratio/low_mean": 0.0016576209105551242, + "clip_ratio/low_min": 0.000283286115154624, + "clip_ratio/region_mean": 0.0020048431353643535, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1995.4, + "completions/max_terminated_length": 1995.4, + "completions/mean_length": 1518.4, + "completions/mean_terminated_length": 1518.4, + "completions/min_length": 1045.2, + "completions/min_terminated_length": 1045.2, + "entropy": 0.3576410233974457, + "epoch": 0.066, + "frac_reward_zero_std": 0.15, + "grad_norm": 0.10342220216989517, + "kl": 2.481777250766754, + "learning_rate": 9.942476128875491e-06, + "loss": 0.01274801343679428, + "num_tokens": 17523295.0, + "reward": 1.586625051498413, + "reward_std": 0.0768978700041771, + "rewards/env_game_reward/mean": 1.586625051498413, + "rewards/env_game_reward/std": 0.10694200098514557, + "sampling/importance_sampling_ratio/max": 1.6131411075592041, + "sampling/importance_sampling_ratio/mean": 0.9216183066368103, + "sampling/importance_sampling_ratio/min": 0.1865751624797475, + "sampling/sampling_logp_difference/max": 10.270191860198974, + "sampling/sampling_logp_difference/mean": 0.049572935700416564, + "step": 825, + "step_time": 7.239306969800237 + }, + { + "epoch": 0.066, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0, + "eval_completions/max_length": 1889.0, + "eval_completions/max_terminated_length": 1889.0, + "eval_completions/mean_length": 1637.9166666666667, + "eval_completions/mean_terminated_length": 1637.9166666666667, + "eval_completions/min_length": 1247.0, + "eval_completions/min_terminated_length": 1247.0, + "eval_entropy": 0.3013338545958201, + "eval_frac_reward_zero_std": 0.08333333333333333, + "eval_kl": 4.269202629725139, + "eval_loss": 0.009242034517228603, + "eval_num_tokens": 17523295.0, + "eval_reward": 1.555416742960612, + "eval_reward_std": 0.13493956004579863, + "eval_rewards/env_game_reward/mean": 1.555416742960612, + "eval_rewards/env_game_reward/std": 0.22197324534257254, + "eval_runtime": 7.4138, + "eval_samples_per_second": 1.349, + "eval_sampling/importance_sampling_ratio/max": 1.2260379791259766, + "eval_sampling/importance_sampling_ratio/mean": 0.9835768540700277, + "eval_sampling/importance_sampling_ratio/min": 0.669548749923706, + "eval_sampling/sampling_logp_difference/max": 0.3156093756357829, + "eval_sampling/sampling_logp_difference/mean": 0.013049931886295477, + "eval_steps_per_second": 0.27, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0026498251594603063, + "clip_ratio/high_mean": 0.0013249125797301531, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0013249125797301531, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1927.0, + "completions/max_terminated_length": 1927.0, + "completions/mean_length": 1415.7375, + "completions/mean_terminated_length": 1415.7375, + "completions/min_length": 1012.2, + "completions/min_terminated_length": 1012.2, + "entropy": 0.30437230616807937, + "epoch": 0.0664, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.0816667228937149, + "kl": 2.4667917609214784, + "learning_rate": 9.942372368250377e-06, + "loss": 0.0189794585108757, + "num_tokens": 17678924.0, + "reward": 1.5786250352859497, + "reward_std": 0.09740396738052368, + "rewards/env_game_reward/mean": 1.5786250352859497, + "rewards/env_game_reward/std": 0.16835185438394545, + "sampling/importance_sampling_ratio/max": 1.5452775001525878, + "sampling/importance_sampling_ratio/mean": 0.9819482445716858, + "sampling/importance_sampling_ratio/min": 0.520148062706763, + "sampling/sampling_logp_difference/max": 4.023156452178955, + "sampling/sampling_logp_difference/mean": 0.033988011069595814, + "step": 830, + "step_time": 6.711538025199843 + }, + { + "clip_ratio/high_max": 0.0032954429741948845, + "clip_ratio/high_mean": 0.0016477214870974422, + "clip_ratio/low_mean": 0.001202381821349263, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0028501033084467053, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1927.4, + "completions/max_terminated_length": 1927.4, + "completions/mean_length": 1432.5, + "completions/mean_terminated_length": 1432.5, + "completions/min_length": 1027.6, + "completions/min_terminated_length": 1027.6, + "entropy": 0.327534431219101, + "epoch": 0.0668, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.06545510143041611, + "kl": 3.1863216876983644, + "learning_rate": 9.942267953128885e-06, + "loss": 0.011105220019817352, + "num_tokens": 17836027.0, + "reward": 1.5842679262161254, + "reward_std": 0.07740294709801673, + "rewards/env_game_reward/mean": 1.5842679262161254, + "rewards/env_game_reward/std": 0.11849845722317695, + "sampling/importance_sampling_ratio/max": 1.4498932838439942, + "sampling/importance_sampling_ratio/mean": 0.9487195491790772, + "sampling/importance_sampling_ratio/min": 0.29165607984107556, + "sampling/sampling_logp_difference/max": 5.829511404037476, + "sampling/sampling_logp_difference/mean": 0.03138357251882553, + "step": 835, + "step_time": 6.9147914322013095 + }, + { + "clip_ratio/high_max": 0.004935299325734377, + "clip_ratio/high_mean": 0.0027280662674456836, + "clip_ratio/low_mean": 0.002076923102140427, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.00480498936958611, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1901.4, + "completions/max_terminated_length": 1901.4, + "completions/mean_length": 1447.075, + "completions/mean_terminated_length": 1447.075, + "completions/min_length": 1032.4, + "completions/min_terminated_length": 1032.4, + "entropy": 0.24285254254937172, + "epoch": 0.0672, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.21589381992816925, + "kl": 1.4673928499221802, + "learning_rate": 9.942162883529365e-06, + "loss": -0.01951298713684082, + "num_tokens": 17994453.0, + "reward": 1.362357211112976, + "reward_std": 0.26269018054008486, + "rewards/env_game_reward/mean": 1.362357211112976, + "rewards/env_game_reward/std": 0.3235168933868408, + "sampling/importance_sampling_ratio/max": 1.762507152557373, + "sampling/importance_sampling_ratio/mean": 1.026704490184784, + "sampling/importance_sampling_ratio/min": 0.6443805754184723, + "sampling/sampling_logp_difference/max": 0.44364252090454104, + "sampling/sampling_logp_difference/mean": 0.013111063651740551, + "step": 840, + "step_time": 6.5076123672013635 + }, + { + "clip_ratio/high_max": 0.0037218520883470774, + "clip_ratio/high_mean": 0.0018609260441735387, + "clip_ratio/low_mean": 0.0017191730381455272, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0035800991114228963, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2005.8, + "completions/max_terminated_length": 2005.8, + "completions/mean_length": 1515.8375, + "completions/mean_terminated_length": 1515.8375, + "completions/min_length": 1026.4, + "completions/min_terminated_length": 1026.4, + "entropy": 0.3497733250260353, + "epoch": 0.0676, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.09921447187662125, + "kl": 1.3188701629638673, + "learning_rate": 9.94205715947029e-06, + "loss": 0.010336892306804657, + "num_tokens": 18158649.0, + "reward": 1.3419702529907227, + "reward_std": 0.24037423133850097, + "rewards/env_game_reward/mean": 1.3419702529907227, + "rewards/env_game_reward/std": 0.3400465488433838, + "sampling/importance_sampling_ratio/max": 1.7033623218536378, + "sampling/importance_sampling_ratio/mean": 0.9735162138938904, + "sampling/importance_sampling_ratio/min": 0.48452032804489137, + "sampling/sampling_logp_difference/max": 0.5224750876426697, + "sampling/sampling_logp_difference/mean": 0.0164469039067626, + "step": 845, + "step_time": 7.678838656601147 + }, + { + "clip_ratio/high_max": 0.004272268549539149, + "clip_ratio/high_mean": 0.0021361342747695743, + "clip_ratio/low_mean": 0.0017409127380233259, + "clip_ratio/low_min": 0.00020811655558645726, + "clip_ratio/region_mean": 0.0038770470418967307, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2252.8, + "completions/max_terminated_length": 2162.0, + "completions/mean_length": 1561.925, + "completions/mean_terminated_length": 1542.1192138671875, + "completions/min_length": 1022.8, + "completions/min_terminated_length": 1022.8, + "entropy": 0.6017354875802994, + "epoch": 0.068, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.38035619258880615, + "kl": 1.1315938919782638, + "learning_rate": 9.941950780970246e-06, + "loss": 0.08799705505371094, + "num_tokens": 18325891.0, + "reward": 1.4051547288894652, + "reward_std": 0.16788736879825591, + "rewards/env_game_reward/mean": 1.4051547288894652, + "rewards/env_game_reward/std": 0.27388512790203096, + "sampling/importance_sampling_ratio/max": 1.4328656196594238, + "sampling/importance_sampling_ratio/mean": 0.8519251465797424, + "sampling/importance_sampling_ratio/min": 0.3052798181772232, + "sampling/sampling_logp_difference/max": 3.1186322927474976, + "sampling/sampling_logp_difference/mean": 0.024020181223750114, + "step": 850, + "step_time": 11.81878617679904 + }, + { + "clip_ratio/high_max": 0.0009455415420234203, + "clip_ratio/high_mean": 0.00047277077101171017, + "clip_ratio/low_mean": 0.000978588976431638, + "clip_ratio/low_min": 0.00048076924867928027, + "clip_ratio/region_mean": 0.0014513597823679447, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2146.6, + "completions/max_terminated_length": 2106.0, + "completions/mean_length": 1544.9875, + "completions/mean_terminated_length": 1529.2891845703125, + "completions/min_length": 1068.0, + "completions/min_terminated_length": 1068.0, + "entropy": 0.5300098687410355, + "epoch": 0.0684, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.15949228405952454, + "kl": 1.1649099051952363, + "learning_rate": 9.941843748047931e-06, + "loss": 0.032023686170578006, + "num_tokens": 18492250.0, + "reward": 1.5154583692550658, + "reward_std": 0.15400618463754653, + "rewards/env_game_reward/mean": 1.5154583692550658, + "rewards/env_game_reward/std": 0.20309044122695924, + "sampling/importance_sampling_ratio/max": 1.7556475162506104, + "sampling/importance_sampling_ratio/mean": 0.9517914056777954, + "sampling/importance_sampling_ratio/min": 0.19819140780014094, + "sampling/sampling_logp_difference/max": 5.317162609100341, + "sampling/sampling_logp_difference/mean": 0.029889048635959627, + "step": 855, + "step_time": 10.00929546920015 + }, + { + "clip_ratio/high_max": 0.002626922621857375, + "clip_ratio/high_mean": 0.0013134613109286875, + "clip_ratio/low_mean": 0.0014400507032405585, + "clip_ratio/low_min": 0.0005416349973529577, + "clip_ratio/region_mean": 0.0027535120607353747, + "completions/clipped_ratio": 0.025, + "completions/max_length": 1999.2, + "completions/max_terminated_length": 1999.2, + "completions/mean_length": 1433.7875, + "completions/mean_terminated_length": 1430.055029296875, + "completions/min_length": 1025.0, + "completions/min_terminated_length": 1025.0, + "entropy": 0.5983447372913361, + "epoch": 0.0688, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.17352095246315002, + "kl": 1.2142629444599151, + "learning_rate": 9.94173606072216e-06, + "loss": 0.03372189402580261, + "num_tokens": 18648334.0, + "reward": 1.4906190633773804, + "reward_std": 0.16036173179745675, + "rewards/env_game_reward/mean": 1.4906190633773804, + "rewards/env_game_reward/std": 0.23361046463251114, + "sampling/importance_sampling_ratio/max": 1.8726997137069703, + "sampling/importance_sampling_ratio/mean": 0.9728065609931946, + "sampling/importance_sampling_ratio/min": 0.2550904452800751, + "sampling/sampling_logp_difference/max": 0.7147324085235596, + "sampling/sampling_logp_difference/mean": 0.021650590375065802, + "step": 860, + "step_time": 8.624581174000923 + }, + { + "clip_ratio/high_max": 0.0015781309455633163, + "clip_ratio/high_mean": 0.0007890654727816582, + "clip_ratio/low_mean": 0.0009416018030606211, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0017306672758422791, + "completions/clipped_ratio": 0.025, + "completions/max_length": 1994.2, + "completions/max_terminated_length": 1990.6, + "completions/mean_length": 1527.225, + "completions/mean_terminated_length": 1523.9850341796875, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.4782690331339836, + "epoch": 0.0692, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.08745373040437698, + "kl": 0.9816187143325805, + "learning_rate": 9.941627719011864e-06, + "loss": 0.003973131626844406, + "num_tokens": 18813877.0, + "reward": 1.4740357398986816, + "reward_std": 0.20445487946271895, + "rewards/env_game_reward/mean": 1.4740357398986816, + "rewards/env_game_reward/std": 0.29361526668071747, + "sampling/importance_sampling_ratio/max": 1.4921527624130249, + "sampling/importance_sampling_ratio/mean": 0.9029739260673523, + "sampling/importance_sampling_ratio/min": 0.15594935725966935, + "sampling/sampling_logp_difference/max": 2.1523058891296385, + "sampling/sampling_logp_difference/mean": 0.01937628984451294, + "step": 865, + "step_time": 8.709527614199033 + }, + { + "clip_ratio/high_max": 0.00011587485205382109, + "clip_ratio/high_mean": 5.7937426026910545e-05, + "clip_ratio/low_mean": 0.0009105266013648361, + "clip_ratio/low_min": 8.110299822874367e-05, + "clip_ratio/region_mean": 0.0009684640273917467, + "completions/clipped_ratio": 0.05, + "completions/max_length": 2151.6, + "completions/max_terminated_length": 2120.2, + "completions/mean_length": 1570.3, + "completions/mean_terminated_length": 1552.0944580078126, + "completions/min_length": 1056.4, + "completions/min_terminated_length": 1056.4, + "entropy": 0.395656031370163, + "epoch": 0.0696, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.145682230591774, + "kl": 0.5095479518175126, + "learning_rate": 9.94151872293609e-06, + "loss": 0.13460533618927, + "num_tokens": 18982505.0, + "reward": 1.5383512496948242, + "reward_std": 0.1430460289120674, + "rewards/env_game_reward/mean": 1.5383512496948242, + "rewards/env_game_reward/std": 0.22576954811811448, + "sampling/importance_sampling_ratio/max": 1.6490776062011718, + "sampling/importance_sampling_ratio/mean": 0.9158960461616517, + "sampling/importance_sampling_ratio/min": 0.18876880555620987, + "sampling/sampling_logp_difference/max": 5.222516059875488, + "sampling/sampling_logp_difference/mean": 0.013656648620963097, + "step": 870, + "step_time": 11.048820288398565 + }, + { + "clip_ratio/high_max": 0.0011350326240062714, + "clip_ratio/high_mean": 0.0005675163120031357, + "clip_ratio/low_mean": 0.0004305612586904317, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009980775532312692, + "completions/clipped_ratio": 0.125, + "completions/max_length": 2096.6, + "completions/max_terminated_length": 2062.8, + "completions/mean_length": 1591.075, + "completions/mean_terminated_length": 1555.29052734375, + "completions/min_length": 1070.6, + "completions/min_terminated_length": 1070.6, + "entropy": 0.4600535213947296, + "epoch": 0.07, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.2988151013851166, + "kl": 0.5012727200984954, + "learning_rate": 9.941409072513995e-06, + "loss": 0.09155293107032776, + "num_tokens": 19152822.0, + "reward": 1.537291669845581, + "reward_std": 0.14348376393318177, + "rewards/env_game_reward/mean": 1.537291669845581, + "rewards/env_game_reward/std": 0.21587826758623124, + "sampling/importance_sampling_ratio/max": 1.8174469709396361, + "sampling/importance_sampling_ratio/mean": 0.8739750266075135, + "sampling/importance_sampling_ratio/min": 0.1697855882086297, + "sampling/sampling_logp_difference/max": 7.956750082969665, + "sampling/sampling_logp_difference/mean": 0.019922800362110138, + "step": 875, + "step_time": 10.96744642199992 + }, + { + "clip_ratio/high_max": 0.0012740743928588926, + "clip_ratio/high_mean": 0.0006370371964294463, + "clip_ratio/low_mean": 0.0002832013618899509, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0009202385641401634, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 2273.2, + "completions/max_terminated_length": 2243.0, + "completions/mean_length": 1785.6375, + "completions/mean_terminated_length": 1753.2542236328125, + "completions/min_length": 1217.2, + "completions/min_terminated_length": 1217.2, + "entropy": 0.4493249997496605, + "epoch": 0.0704, + "frac_reward_zero_std": 0.275, + "grad_norm": 0.15489841997623444, + "kl": 0.4825821310281754, + "learning_rate": 9.941298767764855e-06, + "loss": 0.11126923561096191, + "num_tokens": 19340994.0, + "reward": 1.497991108894348, + "reward_std": 0.18598171770572663, + "rewards/env_game_reward/mean": 1.497991108894348, + "rewards/env_game_reward/std": 0.3028584122657776, + "sampling/importance_sampling_ratio/max": 1.7850000619888307, + "sampling/importance_sampling_ratio/mean": 0.9235016465187073, + "sampling/importance_sampling_ratio/min": 0.24714634816055997, + "sampling/sampling_logp_difference/max": 7.065726852416992, + "sampling/sampling_logp_difference/mean": 0.015392303094267845, + "step": 880, + "step_time": 14.431479327999114 + }, + { + "clip_ratio/high_max": 0.0020738797960802914, + "clip_ratio/high_mean": 0.0011360907810740173, + "clip_ratio/low_mean": 0.00042371204181108625, + "clip_ratio/low_min": 0.00013386880746111274, + "clip_ratio/region_mean": 0.0015598028141539544, + "completions/clipped_ratio": 0.05, + "completions/max_length": 2470.6, + "completions/max_terminated_length": 2435.6, + "completions/mean_length": 1883.7125, + "completions/mean_terminated_length": 1864.0649658203124, + "completions/min_length": 1284.2, + "completions/min_terminated_length": 1284.2, + "entropy": 0.307071553170681, + "epoch": 0.0708, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.1149914562702179, + "kl": 22.70539956986904, + "learning_rate": 9.94118780870806e-06, + "loss": 0.2994292497634888, + "num_tokens": 19534798.0, + "reward": 1.5454092025756836, + "reward_std": 0.21367461532354354, + "rewards/env_game_reward/mean": 1.5454092025756836, + "rewards/env_game_reward/std": 0.29595637023448945, + "sampling/importance_sampling_ratio/max": 1.624759316444397, + "sampling/importance_sampling_ratio/mean": 0.8059692859649659, + "sampling/importance_sampling_ratio/min": 0.06373572412819155, + "sampling/sampling_logp_difference/max": 7.899578714370728, + "sampling/sampling_logp_difference/mean": 0.014438807778060437, + "step": 885, + "step_time": 15.077695120799035 + }, + { + "clip_ratio/high_max": 0.0012833183922339232, + "clip_ratio/high_mean": 0.000692523896577768, + "clip_ratio/low_mean": 0.000726046136696823, + "clip_ratio/low_min": 0.0001866494829300791, + "clip_ratio/region_mean": 0.0014185700129019096, + "completions/clipped_ratio": 0.075, + "completions/max_length": 2363.6, + "completions/max_terminated_length": 2356.4, + "completions/mean_length": 1824.9, + "completions/mean_terminated_length": 1800.4438232421876, + "completions/min_length": 1215.6, + "completions/min_terminated_length": 1215.6, + "entropy": 0.4529956132173538, + "epoch": 0.0712, + "frac_reward_zero_std": 0.225, + "grad_norm": 0.28981900215148926, + "kl": 0.4304444819688797, + "learning_rate": 9.941076195363116e-06, + "loss": 0.01028164178133011, + "num_tokens": 19724009.0, + "reward": 1.5794018268585206, + "reward_std": 0.1288491576910019, + "rewards/env_game_reward/mean": 1.5794018268585206, + "rewards/env_game_reward/std": 0.21744558215141296, + "sampling/importance_sampling_ratio/max": 1.9161789178848267, + "sampling/importance_sampling_ratio/mean": 0.904331374168396, + "sampling/importance_sampling_ratio/min": 3.2518115607157076e-05, + "sampling/sampling_logp_difference/max": 14.280098342895508, + "sampling/sampling_logp_difference/mean": 0.019957776367664336, + "step": 890, + "step_time": 15.076246012999036 + }, + { + "clip_ratio/high_max": 0.001930954516865313, + "clip_ratio/high_mean": 0.0009654772584326565, + "clip_ratio/low_mean": 0.0008705812797416002, + "clip_ratio/low_min": 0.00018939394503831864, + "clip_ratio/region_mean": 0.0018360585323534905, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 2446.4, + "completions/max_terminated_length": 2442.8, + "completions/mean_length": 1835.6875, + "completions/mean_terminated_length": 1819.96455078125, + "completions/min_length": 1285.4, + "completions/min_terminated_length": 1285.4, + "entropy": 0.26241480112075805, + "epoch": 0.0716, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.2046409249305725, + "kl": 0.4811114311218262, + "learning_rate": 9.940963927749643e-06, + "loss": 0.12879633903503418, + "num_tokens": 19914187.0, + "reward": 1.4908958196640014, + "reward_std": 0.27980804443359375, + "rewards/env_game_reward/mean": 1.4908958196640014, + "rewards/env_game_reward/std": 0.3526145279407501, + "sampling/importance_sampling_ratio/max": 1.8963224649429322, + "sampling/importance_sampling_ratio/mean": 0.9690939664840699, + "sampling/importance_sampling_ratio/min": 0.30016586495963227, + "sampling/sampling_logp_difference/max": 2.3270081281661987, + "sampling/sampling_logp_difference/mean": 0.01102782627567649, + "step": 895, + "step_time": 12.819816202801302 + }, + { + "clip_ratio/high_max": 0.0008085624547675252, + "clip_ratio/high_mean": 0.0005102588038425893, + "clip_ratio/low_mean": 0.0005037825962062925, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0010140414000488819, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2384.2, + "completions/max_terminated_length": 2384.2, + "completions/mean_length": 1735.475, + "completions/mean_terminated_length": 1735.693359375, + "completions/min_length": 1147.6, + "completions/min_terminated_length": 1147.6, + "entropy": 0.3156480178236961, + "epoch": 0.072, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.2748168706893921, + "kl": 0.5220715075731277, + "learning_rate": 9.940851005887376e-06, + "loss": -0.02894388735294342, + "num_tokens": 20095788.0, + "reward": 1.4607172727584838, + "reward_std": 0.2872200310230255, + "rewards/env_game_reward/mean": 1.4607172727584838, + "rewards/env_game_reward/std": 0.3808079123497009, + "sampling/importance_sampling_ratio/max": 2.075653338432312, + "sampling/importance_sampling_ratio/mean": 0.9483648419380188, + "sampling/importance_sampling_ratio/min": 0.11939566901285976, + "sampling/sampling_logp_difference/max": 6.860753440856934, + "sampling/sampling_logp_difference/mean": 0.015069704689085483, + "step": 900, + "step_time": 12.859525776599913 + }, + { + "epoch": 0.072, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.041666666666666664, + "eval_completions/max_length": 2727.0, + "eval_completions/max_terminated_length": 2307.3333333333335, + "eval_completions/mean_length": 2059.0, + "eval_completions/mean_terminated_length": 1995.732177734375, + "eval_completions/min_length": 1542.0, + "eval_completions/min_terminated_length": 1542.0, + "eval_entropy": 0.2817843755086263, + "eval_frac_reward_zero_std": 0.16666666666666666, + "eval_kl": 0.3643878698348999, + "eval_loss": -0.0993538573384285, + "eval_num_tokens": 20095788.0, + "eval_reward": 1.457232157389323, + "eval_reward_std": 0.28440003593762714, + "eval_rewards/env_game_reward/mean": 1.457232157389323, + "eval_rewards/env_game_reward/std": 0.3733518322308858, + "eval_runtime": 19.7318, + "eval_samples_per_second": 0.507, + "eval_sampling/importance_sampling_ratio/max": 1.6719778378804524, + "eval_sampling/importance_sampling_ratio/mean": 1.0260129968325298, + "eval_sampling/importance_sampling_ratio/min": 0.3794739445050557, + "eval_sampling/sampling_logp_difference/max": 0.3471987247467041, + "eval_sampling/sampling_logp_difference/mean": 0.007802010824282964, + "eval_steps_per_second": 0.101, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0012189357890747487, + "clip_ratio/high_mean": 0.0007577869633678347, + "clip_ratio/low_mean": 0.0006541514245327562, + "clip_ratio/low_min": 0.0003207181231118739, + "clip_ratio/region_mean": 0.0014119383762590588, + "completions/clipped_ratio": 0.05, + "completions/max_length": 2461.6, + "completions/max_terminated_length": 2448.6, + "completions/mean_length": 1848.575, + "completions/mean_terminated_length": 1824.4354248046875, + "completions/min_length": 1311.6, + "completions/min_terminated_length": 1311.6, + "entropy": 0.3148694097995758, + "epoch": 0.0724, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.19136394560337067, + "kl": 0.43292770683765414, + "learning_rate": 9.940737429796167e-06, + "loss": -0.0005514532327651978, + "num_tokens": 20286731.0, + "reward": 1.5534344911575317, + "reward_std": 0.164444400370121, + "rewards/env_game_reward/mean": 1.5534344911575317, + "rewards/env_game_reward/std": 0.25763210356235505, + "sampling/importance_sampling_ratio/max": 1.920850896835327, + "sampling/importance_sampling_ratio/mean": 0.9820604085922241, + "sampling/importance_sampling_ratio/min": 0.36941553354263307, + "sampling/sampling_logp_difference/max": 3.685489463806152, + "sampling/sampling_logp_difference/mean": 0.012805731128901243, + "step": 905, + "step_time": 13.696737619400665 + }, + { + "clip_ratio/high_max": 0.0015069938148371876, + "clip_ratio/high_mean": 0.000920184439746663, + "clip_ratio/low_mean": 0.00035937174980062994, + "clip_ratio/low_min": 0.00021658835466951132, + "clip_ratio/region_mean": 0.001279556195368059, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 2569.6, + "completions/max_terminated_length": 2511.4, + "completions/mean_length": 1820.3, + "completions/mean_terminated_length": 1806.58505859375, + "completions/min_length": 1275.2, + "completions/min_terminated_length": 1275.2, + "entropy": 0.33324653208255767, + "epoch": 0.0728, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.1512303650379181, + "kl": 0.39021323025226595, + "learning_rate": 9.940623199495979e-06, + "loss": 0.16941314935684204, + "num_tokens": 20474836.0, + "reward": 1.5158392906188964, + "reward_std": 0.2332189589738846, + "rewards/env_game_reward/mean": 1.5158392906188964, + "rewards/env_game_reward/std": 0.31163732558488844, + "sampling/importance_sampling_ratio/max": 2.071822476387024, + "sampling/importance_sampling_ratio/mean": 0.8877307176589966, + "sampling/importance_sampling_ratio/min": 0.07263679296009132, + "sampling/sampling_logp_difference/max": 11.022346949577331, + "sampling/sampling_logp_difference/mean": 0.01563575156033039, + "step": 910, + "step_time": 15.462018063198775 + }, + { + "clip_ratio/high_max": 0.001472691132221371, + "clip_ratio/high_mean": 0.000864570785779506, + "clip_ratio/low_mean": 0.0009776385850273073, + "clip_ratio/low_min": 0.00032047065906226635, + "clip_ratio/region_mean": 0.001842209347523749, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 2470.0, + "completions/max_terminated_length": 2431.2, + "completions/mean_length": 1867.8875, + "completions/mean_terminated_length": 1865.7714599609376, + "completions/min_length": 1331.2, + "completions/min_terminated_length": 1331.2, + "entropy": 0.36800833940505984, + "epoch": 0.0732, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.17658759653568268, + "kl": 0.46147061288356783, + "learning_rate": 9.940508315006892e-06, + "loss": 0.03878949880599976, + "num_tokens": 20666662.0, + "reward": 1.5093273878097535, + "reward_std": 0.2326802283525467, + "rewards/env_game_reward/mean": 1.5093273878097535, + "rewards/env_game_reward/std": 0.32178205251693726, + "sampling/importance_sampling_ratio/max": 1.9555179595947265, + "sampling/importance_sampling_ratio/mean": 0.8527028918266296, + "sampling/importance_sampling_ratio/min": 0.033528552153945776, + "sampling/sampling_logp_difference/max": 10.650108671188354, + "sampling/sampling_logp_difference/mean": 0.022430221550166607, + "step": 915, + "step_time": 14.398402365599031 + }, + { + "clip_ratio/high_max": 0.0011677380418404937, + "clip_ratio/high_mean": 0.0007100923685356975, + "clip_ratio/low_mean": 0.0008342034270754084, + "clip_ratio/low_min": 0.0001377410488203168, + "clip_ratio/region_mean": 0.001544295810163021, + "completions/clipped_ratio": 0.0875, + "completions/max_length": 2436.8, + "completions/max_terminated_length": 2436.8, + "completions/mean_length": 1821.1875, + "completions/mean_terminated_length": 1790.2981689453125, + "completions/min_length": 1247.4, + "completions/min_terminated_length": 1247.4, + "entropy": 0.31378189474344254, + "epoch": 0.0736, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.2656424045562744, + "kl": 0.40927897691726683, + "learning_rate": 9.940392776349104e-06, + "loss": 0.1518398642539978, + "num_tokens": 20854145.0, + "reward": 1.5011369228363036, + "reward_std": 0.19547294974327087, + "rewards/env_game_reward/mean": 1.5011369228363036, + "rewards/env_game_reward/std": 0.3228561282157898, + "sampling/importance_sampling_ratio/max": 1.8403097152709962, + "sampling/importance_sampling_ratio/mean": 0.8172707557678223, + "sampling/importance_sampling_ratio/min": 0.05242107790071242, + "sampling/sampling_logp_difference/max": 10.830488920211792, + "sampling/sampling_logp_difference/mean": 0.01946103759109974, + "step": 920, + "step_time": 15.806952799401916 + }, + { + "clip_ratio/high_max": 0.002922432462219149, + "clip_ratio/high_mean": 0.0016289437422528862, + "clip_ratio/low_mean": 0.0005936998728429899, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.002222643652930856, + "completions/clipped_ratio": 0.0375, + "completions/max_length": 2578.0, + "completions/max_terminated_length": 2346.2, + "completions/mean_length": 1863.5625, + "completions/mean_terminated_length": 1825.928271484375, + "completions/min_length": 1254.6, + "completions/min_terminated_length": 1254.6, + "entropy": 0.2191880613565445, + "epoch": 0.074, + "frac_reward_zero_std": 0.175, + "grad_norm": 0.17762354016304016, + "kl": 0.4934216499328613, + "learning_rate": 9.940276583542922e-06, + "loss": 0.16184706687927247, + "num_tokens": 21046543.0, + "reward": 1.5385030031204223, + "reward_std": 0.19008124768733978, + "rewards/env_game_reward/mean": 1.5385030031204223, + "rewards/env_game_reward/std": 0.3026488572359085, + "sampling/importance_sampling_ratio/max": 1.5807328701019288, + "sampling/importance_sampling_ratio/mean": 0.9147717595100403, + "sampling/importance_sampling_ratio/min": 0.14841571303332562, + "sampling/sampling_logp_difference/max": 7.422147178649903, + "sampling/sampling_logp_difference/mean": 0.011565564665943384, + "step": 925, + "step_time": 13.949015424399112 + }, + { + "clip_ratio/high_max": 0.001220880087930709, + "clip_ratio/high_mean": 0.0006884431757498532, + "clip_ratio/low_mean": 0.0010157813434489072, + "clip_ratio/low_min": 0.0006492685759440064, + "clip_ratio/region_mean": 0.0017042245075572283, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2307.2, + "completions/max_terminated_length": 2307.2, + "completions/mean_length": 1872.1625, + "completions/mean_terminated_length": 1870.493359375, + "completions/min_length": 1372.4, + "completions/min_terminated_length": 1372.4, + "entropy": 0.3217353641986847, + "epoch": 0.0744, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.1309574842453003, + "kl": 0.5329782962799072, + "learning_rate": 9.940159736608773e-06, + "loss": 0.003825952857732773, + "num_tokens": 21240177.0, + "reward": 1.5249047756195069, + "reward_std": 0.1893110007047653, + "rewards/env_game_reward/mean": 1.5249047756195069, + "rewards/env_game_reward/std": 0.3156297117471695, + "sampling/importance_sampling_ratio/max": 1.7126131534576416, + "sampling/importance_sampling_ratio/mean": 0.8812451601028443, + "sampling/importance_sampling_ratio/min": 0.09435489480078894, + "sampling/sampling_logp_difference/max": 12.496303129196168, + "sampling/sampling_logp_difference/mean": 0.022152267023921014, + "step": 930, + "step_time": 12.41166779679843 + }, + { + "clip_ratio/high_max": 0.0024893431225791575, + "clip_ratio/high_mean": 0.0015371947665698826, + "clip_ratio/low_mean": 0.000855160562787205, + "clip_ratio/low_min": 0.00016750418581068516, + "clip_ratio/region_mean": 0.0023923553293570877, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2308.8, + "completions/max_terminated_length": 2303.8, + "completions/mean_length": 1734.975, + "completions/mean_terminated_length": 1722.290869140625, + "completions/min_length": 1180.6, + "completions/min_terminated_length": 1180.6, + "entropy": 0.3756895184516907, + "epoch": 0.0748, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.22695200145244598, + "kl": 0.57938232421875, + "learning_rate": 9.940042235567198e-06, + "loss": 0.06497732996940613, + "num_tokens": 21421312.0, + "reward": 1.5562530040740967, + "reward_std": 0.17508048713207244, + "rewards/env_game_reward/mean": 1.5562530040740967, + "rewards/env_game_reward/std": 0.2580333352088928, + "sampling/importance_sampling_ratio/max": 1.9110926389694214, + "sampling/importance_sampling_ratio/mean": 0.9796088457107544, + "sampling/importance_sampling_ratio/min": 0.1705502788976498, + "sampling/sampling_logp_difference/max": 7.984076070785522, + "sampling/sampling_logp_difference/mean": 0.01644737496972084, + "step": 935, + "step_time": 11.930417591598962 + }, + { + "clip_ratio/high_max": 0.003084265359211713, + "clip_ratio/high_mean": 0.0018833025882486255, + "clip_ratio/low_mean": 0.0007503544562496245, + "clip_ratio/low_min": 0.00014577260008081795, + "clip_ratio/region_mean": 0.00263365704449825, + "completions/clipped_ratio": 0.025, + "completions/max_length": 2340.0, + "completions/max_terminated_length": 2340.0, + "completions/mean_length": 1727.275, + "completions/mean_terminated_length": 1716.6592041015624, + "completions/min_length": 1215.2, + "completions/min_terminated_length": 1215.2, + "entropy": 0.31620822846889496, + "epoch": 0.0752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24355845153331757, + "kl": 0.6145551562309265, + "learning_rate": 9.939924080438852e-06, + "loss": 0.11798319816589356, + "num_tokens": 21601758.0, + "reward": 1.5743244171142579, + "reward_std": 0.15734387189149857, + "rewards/env_game_reward/mean": 1.5743244171142579, + "rewards/env_game_reward/std": 0.22823202311992646, + "sampling/importance_sampling_ratio/max": 2.0933955669403077, + "sampling/importance_sampling_ratio/mean": 1.0007439851760864, + "sampling/importance_sampling_ratio/min": 0.3973370999097824, + "sampling/sampling_logp_difference/max": 0.4600968599319458, + "sampling/sampling_logp_difference/mean": 0.011099493503570557, + "step": 940, + "step_time": 11.759782355798233 + }, + { + "clip_ratio/high_max": 0.001626867160666734, + "clip_ratio/high_mean": 0.0009445225528907031, + "clip_ratio/low_mean": 0.0014562467520590871, + "clip_ratio/low_min": 0.00011627906933426857, + "clip_ratio/region_mean": 0.00240076927584596, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2321.2, + "completions/max_terminated_length": 2321.2, + "completions/mean_length": 1699.95, + "completions/mean_terminated_length": 1694.1233642578125, + "completions/min_length": 1174.8, + "completions/min_terminated_length": 1174.8, + "entropy": 0.2677352011203766, + "epoch": 0.0756, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.15144741535186768, + "kl": 0.6979381173849106, + "learning_rate": 9.939805271244503e-06, + "loss": 0.08515205383300781, + "num_tokens": 21781002.0, + "reward": 1.4935282707214355, + "reward_std": 0.18328165709972383, + "rewards/env_game_reward/mean": 1.4935282707214355, + "rewards/env_game_reward/std": 0.2772886216640472, + "sampling/importance_sampling_ratio/max": 1.7501282215118408, + "sampling/importance_sampling_ratio/mean": 0.8283731698989868, + "sampling/importance_sampling_ratio/min": 0.06416170225804453, + "sampling/sampling_logp_difference/max": 12.058596181869508, + "sampling/sampling_logp_difference/mean": 0.016518401354551314, + "step": 945, + "step_time": 12.266804081601004 + }, + { + "clip_ratio/high_max": 0.0045323959086090325, + "clip_ratio/high_mean": 0.0026198809733614325, + "clip_ratio/low_mean": 0.0007645079400390387, + "clip_ratio/low_min": 0.0003045685356482863, + "clip_ratio/region_mean": 0.0033843889308627696, + "completions/clipped_ratio": 0.05, + "completions/max_length": 2202.6, + "completions/max_terminated_length": 2183.0, + "completions/mean_length": 1666.2375, + "completions/mean_terminated_length": 1642.8640625, + "completions/min_length": 1118.0, + "completions/min_terminated_length": 1118.0, + "entropy": 0.2549864038825035, + "epoch": 0.076, + "frac_reward_zero_std": 0.1, + "grad_norm": 0.09294123202562332, + "kl": 0.7737713813781738, + "learning_rate": 9.939685808005038e-06, + "loss": -0.01393904983997345, + "num_tokens": 21955811.0, + "reward": 1.477282738685608, + "reward_std": 0.21622737050056456, + "rewards/env_game_reward/mean": 1.477282738685608, + "rewards/env_game_reward/std": 0.34079847633838656, + "sampling/importance_sampling_ratio/max": 1.767937183380127, + "sampling/importance_sampling_ratio/mean": 0.9298573493957519, + "sampling/importance_sampling_ratio/min": 0.14373855862080057, + "sampling/sampling_logp_difference/max": 9.330408585071563, + "sampling/sampling_logp_difference/mean": 0.014421308785676957, + "step": 950, + "step_time": 11.463380426800722 + }, + { + "clip_ratio/high_max": 0.005439925438258797, + "clip_ratio/high_mean": 0.003208035236457363, + "clip_ratio/low_mean": 0.0033877444453537463, + "clip_ratio/low_min": 0.0006987016880884766, + "clip_ratio/region_mean": 0.006595779792405665, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2315.6, + "completions/max_terminated_length": 2270.6, + "completions/mean_length": 1716.55, + "completions/mean_terminated_length": 1707.9341796875, + "completions/min_length": 1110.8, + "completions/min_terminated_length": 1110.8, + "entropy": 0.3051726818084717, + "epoch": 0.0764, + "frac_reward_zero_std": 0.05, + "grad_norm": 0.10971368849277496, + "kl": 0.9875833690166473, + "learning_rate": 9.939565690741458e-06, + "loss": 0.011976981163024902, + "num_tokens": 22135718.0, + "reward": 1.409833312034607, + "reward_std": 0.2909154981374741, + "rewards/env_game_reward/mean": 1.409833312034607, + "rewards/env_game_reward/std": 0.45696545839309693, + "sampling/importance_sampling_ratio/max": 1.5724833250045775, + "sampling/importance_sampling_ratio/mean": 0.781300175189972, + "sampling/importance_sampling_ratio/min": 3.93051331259997e-06, + "sampling/sampling_logp_difference/max": 12.314488220214844, + "sampling/sampling_logp_difference/mean": 0.04921445026993752, + "step": 955, + "step_time": 10.660027011402416 + }, + { + "clip_ratio/high_max": 0.0050688618794083595, + "clip_ratio/high_mean": 0.0030160827096551656, + "clip_ratio/low_mean": 0.0024181493849027903, + "clip_ratio/low_min": 0.001296406053006649, + "clip_ratio/region_mean": 0.005434232112020254, + "completions/clipped_ratio": 0.0125, + "completions/max_length": 2306.6, + "completions/max_terminated_length": 2299.2, + "completions/mean_length": 1687.0375, + "completions/mean_terminated_length": 1679.5216796875, + "completions/min_length": 1124.2, + "completions/min_terminated_length": 1124.2, + "entropy": 0.2659656837582588, + "epoch": 0.0768, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.07717063277959824, + "kl": 0.867502224445343, + "learning_rate": 9.939444919474875e-06, + "loss": 0.03871417641639709, + "num_tokens": 22313392.0, + "reward": 1.432592248916626, + "reward_std": 0.2927884966135025, + "rewards/env_game_reward/mean": 1.432592248916626, + "rewards/env_game_reward/std": 0.41850276589393615, + "sampling/importance_sampling_ratio/max": 1.9147189617156983, + "sampling/importance_sampling_ratio/mean": 0.6610526800155639, + "sampling/importance_sampling_ratio/min": 6.190264483464527e-11, + "sampling/sampling_logp_difference/max": 19.166898345947267, + "sampling/sampling_logp_difference/mean": 0.08357204496860504, + "step": 960, + "step_time": 11.269645843000763 + }, + { + "clip_ratio/high_max": 0.00402466943487525, + "clip_ratio/high_mean": 0.002358415606431663, + "clip_ratio/low_mean": 0.0015746165940072388, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0039330321713350715, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2263.4, + "completions/max_terminated_length": 2263.4, + "completions/mean_length": 1771.7, + "completions/mean_terminated_length": 1771.7, + "completions/min_length": 1157.8, + "completions/min_terminated_length": 1157.8, + "entropy": 0.3173187494277954, + "epoch": 0.0772, + "frac_reward_zero_std": 0.2, + "grad_norm": 0.08260554075241089, + "kl": 0.7630613714456558, + "learning_rate": 9.939323494226523e-06, + "loss": -0.0028585586696863174, + "num_tokens": 22499063.0, + "reward": 1.5061264991760255, + "reward_std": 0.22805666625499726, + "rewards/env_game_reward/mean": 1.5061264991760255, + "rewards/env_game_reward/std": 0.37998464703559875, + "sampling/importance_sampling_ratio/max": 1.352769374847412, + "sampling/importance_sampling_ratio/mean": 0.7304522514343261, + "sampling/importance_sampling_ratio/min": 0.05258167387152959, + "sampling/sampling_logp_difference/max": 17.498226356506347, + "sampling/sampling_logp_difference/mean": 0.06935336329042911, + "step": 965, + "step_time": 10.030467351600965 + }, + { + "clip_ratio/high_max": 0.0029599617468193174, + "clip_ratio/high_mean": 0.0017886228044517339, + "clip_ratio/low_mean": 0.002303900709375739, + "clip_ratio/low_min": 0.0009720305213704705, + "clip_ratio/region_mean": 0.004092523554572835, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2316.2, + "completions/max_terminated_length": 2316.2, + "completions/mean_length": 1650.2125, + "completions/mean_terminated_length": 1650.2125, + "completions/min_length": 1100.6, + "completions/min_terminated_length": 1100.6, + "entropy": 0.31040072441101074, + "epoch": 0.0776, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.17387092113494873, + "kl": 0.7710592210292816, + "learning_rate": 9.939201415017744e-06, + "loss": -0.057826101779937744, + "num_tokens": 22674597.0, + "reward": 1.4797053575515746, + "reward_std": 0.27919352650642393, + "rewards/env_game_reward/mean": 1.4797053575515746, + "rewards/env_game_reward/std": 0.37522571682929995, + "sampling/importance_sampling_ratio/max": 1.5621517896652222, + "sampling/importance_sampling_ratio/mean": 0.6350171446800232, + "sampling/importance_sampling_ratio/min": 1.4516258057226337e-12, + "sampling/sampling_logp_difference/max": 24.295112991333006, + "sampling/sampling_logp_difference/mean": 0.09522517807781697, + "step": 970, + "step_time": 10.581186058397725 + }, + { + "clip_ratio/high_max": 0.0035527752072084693, + "clip_ratio/high_mean": 0.0019544816430425273, + "clip_ratio/low_mean": 0.0014669578959001228, + "clip_ratio/low_min": 0.00055584826041013, + "clip_ratio/region_mean": 0.0034214395738672463, + "completions/clipped_ratio": 0.05, + "completions/max_length": 2394.8, + "completions/max_terminated_length": 2261.6, + "completions/mean_length": 1739.1625, + "completions/mean_terminated_length": 1705.8567138671874, + "completions/min_length": 1138.8, + "completions/min_terminated_length": 1138.8, + "entropy": 0.450977149605751, + "epoch": 0.078, + "frac_reward_zero_std": 0.075, + "grad_norm": 0.13805274665355682, + "kl": 0.7013565450906754, + "learning_rate": 9.93907868187e-06, + "loss": -0.0027586638927459715, + "num_tokens": 22857731.0, + "reward": 1.4618571519851684, + "reward_std": 0.28324676156044004, + "rewards/env_game_reward/mean": 1.4618571519851684, + "rewards/env_game_reward/std": 0.3617980420589447, + "sampling/importance_sampling_ratio/max": 1.79449143409729, + "sampling/importance_sampling_ratio/mean": 0.637387079000473, + "sampling/importance_sampling_ratio/min": 3.107777472507138e-14, + "sampling/sampling_logp_difference/max": 22.73606185913086, + "sampling/sampling_logp_difference/mean": 0.08924243450164795, + "step": 975, + "step_time": 12.915631853001106 + }, + { + "epoch": 0.078, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.08333333333333333, + "eval_completions/max_length": 2266.0, + "eval_completions/max_terminated_length": 2250.0, + "eval_completions/mean_length": 1855.9166666666667, + "eval_completions/mean_terminated_length": 1823.6310221354167, + "eval_completions/min_length": 1335.3333333333333, + "eval_completions/min_terminated_length": 1335.3333333333333, + "eval_entropy": 0.3405195375283559, + "eval_frac_reward_zero_std": 0.3333333333333333, + "eval_kl": 0.5966706077257792, + "eval_loss": -0.01148859690874815, + "eval_num_tokens": 22857731.0, + "eval_reward": 1.6079761981964111, + "eval_reward_std": 0.11246365184585254, + "eval_rewards/env_game_reward/mean": 1.6079761981964111, + "eval_rewards/env_game_reward/std": 0.1685823400815328, + "eval_runtime": 16.002, + "eval_samples_per_second": 0.625, + "eval_sampling/importance_sampling_ratio/max": 1.5225164890289307, + "eval_sampling/importance_sampling_ratio/mean": 0.5733944574991862, + "eval_sampling/importance_sampling_ratio/min": 2.351474487362804e-12, + "eval_sampling/sampling_logp_difference/max": 23.101406733194988, + "eval_sampling/sampling_logp_difference/mean": 0.09195188557108243, + "eval_steps_per_second": 0.125, + "step": 975 + } + ], + "logging_steps": 5, + "max_steps": 37500, + "num_input_tokens_seen": 22857731, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}