diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,59513 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 3120, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8497.0, + "completions/mean_length": 7881.193359375, + "completions/mean_terminated_length": 1474.9691162109375, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.14000436663627625, + "epoch": 0.0016025641025641025, + "frac_reward_zero_std": 0.03125, + "grad_norm": 568.662109375, + "learning_rate": 1e-06, + "loss": 0.274, + "num_tokens": 4883523.0, + "reward": 0.28620433807373047, + "reward_std": 0.19634175300598145, + "rewards/progression_diversity/mean": -0.09636208415031433, + "rewards/progression_diversity/std": 0.1318667083978653, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.5490233898162842, + "rewards/symbolic_reward_partial_score/std": 0.4077247977256775, + "rewards/tag_count_reward/mean": -0.369140625, + "rewards/tag_count_reward/std": 0.4830440282821655, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890698790550232, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 412.0, + "sampling/sampling_logp_difference/mean": 11.238412857055664, + "step": 1 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.15292271971702576, + "epoch": 0.003205128205128205, + "grad_norm": 1453.4033203125, + "learning_rate": 1e-06, + "loss": 0.2567, + "step": 2 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.1447555311024189, + "epoch": 0.004807692307692308, + "grad_norm": 155.9544219970703, + "learning_rate": 1e-06, + "loss": 0.2182, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3828125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.15076642483472824, + "epoch": 0.00641025641025641, + "grad_norm": 180.8573455810547, + "learning_rate": 1e-06, + "loss": 0.2647, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.47265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8724.0, + "completions/mean_length": 8531.75, + "completions/mean_terminated_length": 1493.807373046875, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "entropy": 0.141545832157135, + "epoch": 0.008012820512820512, + "frac_reward_zero_std": 0.0, + "grad_norm": 267.66961669921875, + "learning_rate": 1e-06, + "loss": 0.1422, + "num_tokens": 10076899.0, + "reward": 0.27696967124938965, + "reward_std": 0.21476207673549652, + "rewards/progression_diversity/mean": -0.10186256468296051, + "rewards/progression_diversity/std": 0.1328817903995514, + "rewards/symbolic_reward_accuracy/mean": 0.267578125, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.5268880128860474, + "rewards/symbolic_reward_partial_score/std": 0.4123326241970062, + "rewards/tag_count_reward/mean": -0.40625, + "rewards/tag_count_reward/std": 0.49161264300346375, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984929621219635, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 412.0, + "sampling/sampling_logp_difference/mean": 11.871856689453125, + "step": 5 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.14717822521924973, + "epoch": 0.009615384615384616, + "grad_norm": 116.89068603515625, + "learning_rate": 1e-06, + "loss": 0.2298, + "step": 6 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.13329345732927322, + "epoch": 0.011217948717948718, + "grad_norm": 971.0422973632812, + "learning_rate": 1e-06, + "loss": 0.3372, + "step": 7 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.15128736197948456, + "epoch": 0.01282051282051282, + "grad_norm": 416.9343566894531, + "learning_rate": 1e-06, + "loss": 0.3162, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.44140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8558.0, + "completions/mean_length": 8034.72265625, + "completions/mean_terminated_length": 1437.0419921875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.12974420934915543, + "epoch": 0.014423076923076924, + "frac_reward_zero_std": 0.0, + "grad_norm": 525.6014404296875, + "learning_rate": 1e-06, + "loss": 0.3298, + "num_tokens": 15093045.0, + "reward": 0.22926117479801178, + "reward_std": 0.21232059597969055, + "rewards/progression_diversity/mean": -0.10122619569301605, + "rewards/progression_diversity/std": 0.1385258138179779, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.5384114980697632, + "rewards/symbolic_reward_partial_score/std": 0.38595736026763916, + "rewards/tag_count_reward/mean": -0.390625, + "rewards/tag_count_reward/std": 0.48836761713027954, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9804725050926208, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 412.0, + "sampling/sampling_logp_difference/mean": 12.684000968933105, + "step": 9 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.1481311321258545, + "epoch": 0.016025641025641024, + "grad_norm": 207.61981201171875, + "learning_rate": 1e-06, + "loss": 0.2655, + "step": 10 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.14685294777154922, + "epoch": 0.017628205128205128, + "grad_norm": 452.76580810546875, + "learning_rate": 1e-06, + "loss": 0.3486, + "step": 11 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.13027992472052574, + "epoch": 0.019230769230769232, + "grad_norm": 58.394657135009766, + "learning_rate": 1e-06, + "loss": 0.3177, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.384765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4718.0, + "completions/mean_length": 7190.44921875, + "completions/mean_terminated_length": 1440.831787109375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.145263209939003, + "epoch": 0.020833333333333332, + "frac_reward_zero_std": 0.0, + "grad_norm": 442.4850158691406, + "learning_rate": 1e-06, + "loss": 0.2593, + "num_tokens": 19648011.0, + "reward": 0.234406977891922, + "reward_std": 0.19648107886314392, + "rewards/progression_diversity/mean": -0.08762294054031372, + "rewards/progression_diversity/std": 0.1332884430885315, + "rewards/symbolic_reward_accuracy/mean": 0.16796875, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.5577148199081421, + "rewards/symbolic_reward_partial_score/std": 0.37337514758110046, + "rewards/tag_count_reward/mean": -0.328125, + "rewards/tag_count_reward/std": 0.4699897766113281, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983834981918335, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 414.0, + "sampling/sampling_logp_difference/mean": 12.08355712890625, + "step": 13 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.14843080937862396, + "epoch": 0.022435897435897436, + "grad_norm": 94.6893081665039, + "learning_rate": 1e-06, + "loss": 0.2235, + "step": 14 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.16313014179468155, + "epoch": 0.02403846153846154, + "grad_norm": 207.52377319335938, + "learning_rate": 1e-06, + "loss": 0.2741, + "step": 15 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.15617449581623077, + "epoch": 0.02564102564102564, + "grad_norm": 304.16375732421875, + "learning_rate": 1e-06, + "loss": 0.3583, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.349609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8593.0, + "completions/mean_length": 6609.98046875, + "completions/mean_terminated_length": 1356.078125, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.20762895047664642, + "epoch": 0.027243589743589744, + "frac_reward_zero_std": 0.0625, + "grad_norm": 390.9715576171875, + "learning_rate": 1e-06, + "loss": 0.2115, + "num_tokens": 23865953.0, + "reward": 0.32011666893959045, + "reward_std": 0.19630393385887146, + "rewards/progression_diversity/mean": -0.08647972345352173, + "rewards/progression_diversity/std": 0.1337437778711319, + "rewards/symbolic_reward_accuracy/mean": 0.28515625, + "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, + "rewards/symbolic_reward_partial_score/mean": 0.6044433116912842, + "rewards/symbolic_reward_partial_score/std": 0.38585761189460754, + "rewards/tag_count_reward/mean": -0.314453125, + "rewards/tag_count_reward/std": 0.4647517800331116, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987565815448761, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 416.0, + "sampling/sampling_logp_difference/mean": 11.790186882019043, + "step": 17 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.16485845297574997, + "epoch": 0.028846153846153848, + "grad_norm": 20.569774627685547, + "learning_rate": 1e-06, + "loss": 0.3498, + "step": 18 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.14989716559648514, + "epoch": 0.030448717948717948, + "grad_norm": 29.013126373291016, + "learning_rate": 1e-06, + "loss": 0.2945, + "step": 19 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.16761308163404465, + "epoch": 0.03205128205128205, + "grad_norm": 17.99753761291504, + "learning_rate": 1e-06, + "loss": 0.2861, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.396484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15963.0, + "completions/mean_length": 7455.453125, + "completions/mean_terminated_length": 1589.7735595703125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.1430811509490013, + "epoch": 0.03365384615384615, + "frac_reward_zero_std": 0.0, + "grad_norm": 338.8027648925781, + "learning_rate": 1e-06, + "loss": 0.3612, + "num_tokens": 28525033.0, + "reward": 0.2473517507314682, + "reward_std": 0.21615710854530334, + "rewards/progression_diversity/mean": -0.09685845673084259, + "rewards/progression_diversity/std": 0.13814808428287506, + "rewards/symbolic_reward_accuracy/mean": 0.193359375, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.5562499761581421, + "rewards/symbolic_reward_partial_score/std": 0.38326844573020935, + "rewards/tag_count_reward/mean": -0.345703125, + "rewards/tag_count_reward/std": 0.4760620892047882, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782972931861877, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 416.0, + "sampling/sampling_logp_difference/mean": 13.17431640625, + "step": 21 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.15020054578781128, + "epoch": 0.035256410256410256, + "grad_norm": 72.29728698730469, + "learning_rate": 1e-06, + "loss": 0.2999, + "step": 22 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.14304012060165405, + "epoch": 0.03685897435897436, + "grad_norm": 74.60310363769531, + "learning_rate": 1e-06, + "loss": 0.2514, + "step": 23 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.1455524042248726, + "epoch": 0.038461538461538464, + "grad_norm": 23.46770668029785, + "learning_rate": 1e-06, + "loss": 0.2941, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.431640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8141.0, + "completions/mean_length": 7897.568359375, + "completions/mean_terminated_length": 1452.54638671875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.15165143460035324, + "epoch": 0.04006410256410257, + "frac_reward_zero_std": 0.03125, + "grad_norm": 739.3831176757812, + "learning_rate": 1e-06, + "loss": 0.2292, + "num_tokens": 33441724.0, + "reward": 0.2494029402732849, + "reward_std": 0.1849609613418579, + "rewards/progression_diversity/mean": -0.10511670261621475, + "rewards/progression_diversity/std": 0.14027173817157745, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.5431803464889526, + "rewards/symbolic_reward_partial_score/std": 0.38958677649497986, + "rewards/tag_count_reward/mean": -0.37890625, + "rewards/tag_count_reward/std": 0.4855891764163971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9824032783508301, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 416.0, + "sampling/sampling_logp_difference/mean": 12.627696990966797, + "step": 25 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.14603624492883682, + "epoch": 0.041666666666666664, + "grad_norm": 473.4931335449219, + "learning_rate": 1e-06, + "loss": 0.2678, + "step": 26 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3515625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.13400625064969063, + "epoch": 0.04326923076923077, + "grad_norm": 485.1720886230469, + "learning_rate": 1e-06, + "loss": 0.3149, + "step": 27 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.14134199172258377, + "epoch": 0.04487179487179487, + "grad_norm": 265.03302001953125, + "learning_rate": 1e-06, + "loss": 0.2932, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5220.0, + "completions/mean_length": 8731.86328125, + "completions/mean_terminated_length": 1543.492431640625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.12327419593930244, + "epoch": 0.046474358974358976, + "frac_reward_zero_std": 0.03125, + "grad_norm": 270.61407470703125, + "learning_rate": 1e-06, + "loss": 0.2511, + "num_tokens": 38830534.0, + "reward": 0.20257076621055603, + "reward_std": 0.1862163543701172, + "rewards/progression_diversity/mean": -0.11499577015638351, + "rewards/progression_diversity/std": 0.14146001636981964, + "rewards/symbolic_reward_accuracy/mean": 0.166015625, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.49026691913604736, + "rewards/symbolic_reward_partial_score/std": 0.3924698829650879, + "rewards/tag_count_reward/mean": -0.4296875, + "rewards/tag_count_reward/std": 0.4955156147480011, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978987991809845, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 416.0, + "sampling/sampling_logp_difference/mean": 13.129805564880371, + "step": 29 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.13143375515937805, + "epoch": 0.04807692307692308, + "grad_norm": 348.0553283691406, + "learning_rate": 1e-06, + "loss": 0.2894, + "step": 30 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.1378588154911995, + "epoch": 0.049679487179487176, + "grad_norm": 297.4830322265625, + "learning_rate": 1e-06, + "loss": 0.2873, + "step": 31 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.4765625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.11010397225618362, + "epoch": 0.05128205128205128, + "grad_norm": 120.72930145263672, + "learning_rate": 1e-06, + "loss": 0.3805, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.412109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10000.0, + "completions/mean_length": 7616.541015625, + "completions/mean_terminated_length": 1470.5814208984375, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.15361008048057556, + "epoch": 0.052884615384615384, + "frac_reward_zero_std": 0.09375, + "grad_norm": 288.5027160644531, + "learning_rate": 1e-06, + "loss": 0.2605, + "num_tokens": 43573467.0, + "reward": 0.26951634883880615, + "reward_std": 0.16951517760753632, + "rewards/progression_diversity/mean": -0.10158812254667282, + "rewards/progression_diversity/std": 0.14149615168571472, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.549560546875, + "rewards/symbolic_reward_partial_score/std": 0.393799364566803, + "rewards/tag_count_reward/mean": -0.361328125, + "rewards/tag_count_reward/std": 0.48085519671440125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9846019744873047, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 420.0, + "sampling/sampling_logp_difference/mean": 12.345019340515137, + "step": 33 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.1322014182806015, + "epoch": 0.05448717948717949, + "grad_norm": 719.1787719726562, + "learning_rate": 1e-06, + "loss": 0.2974, + "step": 34 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.1471644788980484, + "epoch": 0.05608974358974359, + "grad_norm": 506.037109375, + "learning_rate": 1e-06, + "loss": 0.2677, + "step": 35 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.144605353474617, + "epoch": 0.057692307692307696, + "grad_norm": 172.5960693359375, + "learning_rate": 1e-06, + "loss": 0.2405, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.376953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8070.0, + "completions/mean_length": 7113.50390625, + "completions/mean_terminated_length": 1504.7083740234375, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "entropy": 0.15522907674312592, + "epoch": 0.05929487179487179, + "frac_reward_zero_std": 0.0625, + "grad_norm": 546.6926879882812, + "learning_rate": 1e-06, + "loss": 0.314, + "num_tokens": 48071261.0, + "reward": 0.29389289021492004, + "reward_std": 0.20416641235351562, + "rewards/progression_diversity/mean": -0.09996913373470306, + "rewards/progression_diversity/std": 0.1435985416173935, + "rewards/symbolic_reward_accuracy/mean": 0.25390625, + "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, + "rewards/symbolic_reward_partial_score/mean": 0.5851888656616211, + "rewards/symbolic_reward_partial_score/std": 0.38423076272010803, + "rewards/tag_count_reward/mean": -0.330078125, + "rewards/tag_count_reward/std": 0.47070086002349854, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879682660102844, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 420.0, + "sampling/sampling_logp_difference/mean": 12.075263023376465, + "step": 37 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.1799706146121025, + "epoch": 0.060897435897435896, + "grad_norm": 217.79710388183594, + "learning_rate": 1e-06, + "loss": 0.2229, + "step": 38 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.1559307500720024, + "epoch": 0.0625, + "grad_norm": 349.0166931152344, + "learning_rate": 1e-06, + "loss": 0.2719, + "step": 39 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.15247832983732224, + "epoch": 0.0641025641025641, + "grad_norm": 172.07949829101562, + "learning_rate": 1e-06, + "loss": 0.319, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.427734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4535.0, + "completions/mean_length": 7818.34375, + "completions/mean_terminated_length": 1416.0272216796875, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "entropy": 0.13341034203767776, + "epoch": 0.06570512820512821, + "frac_reward_zero_std": 0.03125, + "grad_norm": 447.7645568847656, + "learning_rate": 1e-06, + "loss": 0.273, + "num_tokens": 53003245.0, + "reward": 0.218738853931427, + "reward_std": 0.2060059905052185, + "rewards/progression_diversity/mean": -0.1119544506072998, + "rewards/progression_diversity/std": 0.1507621854543686, + "rewards/symbolic_reward_accuracy/mean": 0.171875, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.5076009035110474, + "rewards/symbolic_reward_partial_score/std": 0.3918800354003906, + "rewards/tag_count_reward/mean": -0.35546875, + "rewards/tag_count_reward/std": 0.47912323474884033, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9806472659111023, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 420.0, + "sampling/sampling_logp_difference/mean": 13.268798828125, + "step": 41 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.14531062543392181, + "epoch": 0.0673076923076923, + "grad_norm": 1331.5009765625, + "learning_rate": 1e-06, + "loss": 0.3872, + "step": 42 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.1627102643251419, + "epoch": 0.06891025641025642, + "grad_norm": 724.1676025390625, + "learning_rate": 1e-06, + "loss": 0.3012, + "step": 43 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.16679170727729797, + "epoch": 0.07051282051282051, + "grad_norm": 689.5390625, + "learning_rate": 1e-06, + "loss": 0.3947, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.369140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4762.0, + "completions/mean_length": 6941.892578125, + "completions/mean_terminated_length": 1416.9442138671875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.1360173150897026, + "epoch": 0.07211538461538461, + "frac_reward_zero_std": 0.0, + "grad_norm": 262.25537109375, + "learning_rate": 1e-06, + "loss": 0.3696, + "num_tokens": 57407846.0, + "reward": 0.32967936992645264, + "reward_std": 0.2605833411216736, + "rewards/progression_diversity/mean": -0.10432835668325424, + "rewards/progression_diversity/std": 0.15316592156887054, + "rewards/symbolic_reward_accuracy/mean": 0.302734375, + "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, + "rewards/symbolic_reward_partial_score/mean": 0.6082682609558105, + "rewards/symbolic_reward_partial_score/std": 0.3930491805076599, + "rewards/tag_count_reward/mean": -0.333984375, + "rewards/tag_count_reward/std": 0.47209542989730835, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9809261560440063, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 422.0, + "sampling/sampling_logp_difference/mean": 13.200485229492188, + "step": 45 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.1455419659614563, + "epoch": 0.07371794871794872, + "grad_norm": 1122.5438232421875, + "learning_rate": 1e-06, + "loss": 0.2685, + "step": 46 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.17444385588169098, + "epoch": 0.07532051282051282, + "grad_norm": 311.899169921875, + "learning_rate": 1e-06, + "loss": 0.2814, + "step": 47 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.17196480184793472, + "epoch": 0.07692307692307693, + "grad_norm": 131.08229064941406, + "learning_rate": 1e-06, + "loss": 0.2203, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.41796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4885.0, + "completions/mean_length": 7645.94921875, + "completions/mean_terminated_length": 1370.97314453125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.14967508614063263, + "epoch": 0.07852564102564102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1408.8106689453125, + "learning_rate": 1e-06, + "loss": 0.2864, + "num_tokens": 62251772.0, + "reward": 0.27286070585250854, + "reward_std": 0.2150975465774536, + "rewards/progression_diversity/mean": -0.11773853003978729, + "rewards/progression_diversity/std": 0.15623939037322998, + "rewards/symbolic_reward_accuracy/mean": 0.251953125, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.5332520008087158, + "rewards/symbolic_reward_partial_score/std": 0.40305641293525696, + "rewards/tag_count_reward/mean": -0.37109375, + "rewards/tag_count_reward/std": 0.4835699498653412, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9803931713104248, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 424.0, + "sampling/sampling_logp_difference/mean": 13.439233779907227, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.1299237497150898, + "epoch": 0.08012820512820513, + "grad_norm": 226.3897705078125, + "learning_rate": 1e-06, + "loss": 0.3439, + "step": 50 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.14567925781011581, + "epoch": 0.08173076923076923, + "grad_norm": 225.25894165039062, + "learning_rate": 1e-06, + "loss": 0.227, + "step": 51 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.16078957170248032, + "epoch": 0.08333333333333333, + "grad_norm": 259.5469665527344, + "learning_rate": 1e-06, + "loss": 0.2553, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.384765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14739.0, + "completions/mean_length": 7208.03515625, + "completions/mean_terminated_length": 1469.4158935546875, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.16834494471549988, + "epoch": 0.08493589743589744, + "frac_reward_zero_std": 0.03125, + "grad_norm": 538.5927734375, + "learning_rate": 1e-06, + "loss": 0.2477, + "num_tokens": 66784174.0, + "reward": 0.3087325990200043, + "reward_std": 0.2367071658372879, + "rewards/progression_diversity/mean": -0.1125800758600235, + "rewards/progression_diversity/std": 0.15995118021965027, + "rewards/symbolic_reward_accuracy/mean": 0.2890625, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.5712727904319763, + "rewards/symbolic_reward_partial_score/std": 0.4000099301338196, + "rewards/tag_count_reward/mean": -0.349609375, + "rewards/tag_count_reward/std": 0.47731292247772217, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840480089187622, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 428.0, + "sampling/sampling_logp_difference/mean": 12.845325469970703, + "step": 53 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.15644784271717072, + "epoch": 0.08653846153846154, + "grad_norm": 271.4656677246094, + "learning_rate": 1e-06, + "loss": 0.2644, + "step": 54 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.1287785843014717, + "epoch": 0.08814102564102565, + "grad_norm": 238.35011291503906, + "learning_rate": 1e-06, + "loss": 0.2856, + "step": 55 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.15913260728120804, + "epoch": 0.08974358974358974, + "grad_norm": 207.0217742919922, + "learning_rate": 1e-06, + "loss": 0.3107, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.44140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6438.0, + "completions/mean_length": 8034.2265625, + "completions/mean_terminated_length": 1436.15380859375, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.14069371670484543, + "epoch": 0.09134615384615384, + "frac_reward_zero_std": 0.0, + "grad_norm": 261.5517578125, + "learning_rate": 1e-06, + "loss": 0.2341, + "num_tokens": 71809458.0, + "reward": 0.28543901443481445, + "reward_std": 0.24921724200248718, + "rewards/progression_diversity/mean": -0.13432222604751587, + "rewards/progression_diversity/std": 0.17075546085834503, + "rewards/symbolic_reward_accuracy/mean": 0.2734375, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.5334147214889526, + "rewards/symbolic_reward_partial_score/std": 0.41190940141677856, + "rewards/tag_count_reward/mean": -0.373046875, + "rewards/tag_count_reward/std": 0.48408737778663635, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9736584424972534, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 432.0, + "sampling/sampling_logp_difference/mean": 14.813698768615723, + "step": 57 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.1330208107829094, + "epoch": 0.09294871794871795, + "grad_norm": 234.9654541015625, + "learning_rate": 1e-06, + "loss": 0.2835, + "step": 58 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.4140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.13570880889892578, + "epoch": 0.09455128205128205, + "grad_norm": 195.11659240722656, + "learning_rate": 1e-06, + "loss": 0.2919, + "step": 59 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.453125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.12034280598163605, + "epoch": 0.09615384615384616, + "grad_norm": 103.8945083618164, + "learning_rate": 1e-06, + "loss": 0.3713, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16376.0, + "completions/mean_length": 7816.056640625, + "completions/mean_terminated_length": 1563.773681640625, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.1452302411198616, + "epoch": 0.09775641025641026, + "frac_reward_zero_std": 0.03125, + "grad_norm": 524.85205078125, + "learning_rate": 1e-06, + "loss": 0.203, + "num_tokens": 76649167.0, + "reward": 0.2318515032529831, + "reward_std": 0.18006575107574463, + "rewards/progression_diversity/mean": -0.13223427534103394, + "rewards/progression_diversity/std": 0.17053887248039246, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.5187825560569763, + "rewards/symbolic_reward_partial_score/std": 0.3892955183982849, + "rewards/tag_count_reward/mean": -0.361328125, + "rewards/tag_count_reward/std": 0.48085519671440125, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811015129089355, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 434.0, + "sampling/sampling_logp_difference/mean": 13.738810539245605, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.1534811109304428, + "epoch": 0.09935897435897435, + "grad_norm": 12.576883316040039, + "learning_rate": 1e-06, + "loss": 0.276, + "step": 62 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.14197808876633644, + "epoch": 0.10096153846153846, + "grad_norm": 69.41390991210938, + "learning_rate": 1e-06, + "loss": 0.3, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.15332899242639542, + "epoch": 0.10256410256410256, + "grad_norm": 46.41984939575195, + "learning_rate": 1e-06, + "loss": 0.3122, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4021.0, + "completions/mean_length": 7709.984375, + "completions/mean_terminated_length": 1380.29736328125, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "entropy": 0.11576578766107559, + "epoch": 0.10416666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 849.912353515625, + "learning_rate": 1e-06, + "loss": 0.3962, + "num_tokens": 81463175.0, + "reward": 0.2805628180503845, + "reward_std": 0.22776535153388977, + "rewards/progression_diversity/mean": -0.13366013765335083, + "rewards/progression_diversity/std": 0.1738085299730301, + "rewards/symbolic_reward_accuracy/mean": 0.267578125, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.530810534954071, + "rewards/symbolic_reward_partial_score/std": 0.40938085317611694, + "rewards/tag_count_reward/mean": -0.37890625, + "rewards/tag_count_reward/std": 0.4855891764163971, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9786410331726074, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 436.0, + "sampling/sampling_logp_difference/mean": 14.30560302734375, + "step": 65 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.15666405856609344, + "epoch": 0.10576923076923077, + "grad_norm": 67.19806671142578, + "learning_rate": 1e-06, + "loss": 0.2378, + "step": 66 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.14105742424726486, + "epoch": 0.10737179487179487, + "grad_norm": 82.8458023071289, + "learning_rate": 1e-06, + "loss": 0.3127, + "step": 67 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.14606218039989471, + "epoch": 0.10897435897435898, + "grad_norm": 113.2381362915039, + "learning_rate": 1e-06, + "loss": 0.2326, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5696.0, + "completions/mean_length": 7631.79296875, + "completions/mean_terminated_length": 1446.9000244140625, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.15287064760923386, + "epoch": 0.11057692307692307, + "frac_reward_zero_std": 0.0625, + "grad_norm": 262.7812194824219, + "learning_rate": 1e-06, + "loss": 0.2887, + "num_tokens": 86220381.0, + "reward": 0.29364585876464844, + "reward_std": 0.19495844841003418, + "rewards/progression_diversity/mean": -0.13199639320373535, + "rewards/progression_diversity/std": 0.17195309698581696, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.5769693851470947, + "rewards/symbolic_reward_partial_score/std": 0.39537864923477173, + "rewards/tag_count_reward/mean": -0.375, + "rewards/tag_count_reward/std": 0.4845963716506958, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830174446105957, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 436.0, + "sampling/sampling_logp_difference/mean": 13.659445762634277, + "step": 69 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.1318165846168995, + "epoch": 0.11217948717948718, + "grad_norm": 550.3621826171875, + "learning_rate": 1e-06, + "loss": 0.3745, + "step": 70 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.1556747555732727, + "epoch": 0.11378205128205128, + "grad_norm": 144.26145935058594, + "learning_rate": 1e-06, + "loss": 0.2036, + "step": 71 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.3671875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.1447812095284462, + "epoch": 0.11538461538461539, + "grad_norm": 35.39904022216797, + "learning_rate": 1e-06, + "loss": 0.2419, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.431640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8566.0, + "completions/mean_length": 7925.490234375, + "completions/mean_terminated_length": 1501.673583984375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "entropy": 0.1336100473999977, + "epoch": 0.11698717948717949, + "frac_reward_zero_std": 0.0, + "grad_norm": 890.0448608398438, + "learning_rate": 1e-06, + "loss": 0.3014, + "num_tokens": 91139272.0, + "reward": 0.23745985329151154, + "reward_std": 0.2408495545387268, + "rewards/progression_diversity/mean": -0.14415252208709717, + "rewards/progression_diversity/std": 0.18349310755729675, + "rewards/symbolic_reward_accuracy/mean": 0.1953125, + "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, + "rewards/symbolic_reward_partial_score/mean": 0.5307128429412842, + "rewards/symbolic_reward_partial_score/std": 0.38882943987846375, + "rewards/tag_count_reward/mean": -0.375, + "rewards/tag_count_reward/std": 0.4845963716506958, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9764099717140198, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 438.0, + "sampling/sampling_logp_difference/mean": 14.951321601867676, + "step": 73 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.13262945413589478, + "epoch": 0.11858974358974358, + "grad_norm": 236.77122497558594, + "learning_rate": 1e-06, + "loss": 0.3062, + "step": 74 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.4375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6015625, + "entropy": 0.12846176326274872, + "epoch": 0.1201923076923077, + "grad_norm": 424.5876159667969, + "learning_rate": 1e-06, + "loss": 0.3248, + "step": 75 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.1426731012761593, + "epoch": 0.12179487179487179, + "grad_norm": 137.90576171875, + "learning_rate": 1e-06, + "loss": 0.2592, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.400390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6630.0, + "completions/mean_length": 7431.486328125, + "completions/mean_terminated_length": 1453.4234619140625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.1482650637626648, + "epoch": 0.1233974358974359, + "frac_reward_zero_std": 0.0, + "grad_norm": 503.6921691894531, + "learning_rate": 1e-06, + "loss": 0.309, + "num_tokens": 95835905.0, + "reward": 0.31015902757644653, + "reward_std": 0.23790916800498962, + "rewards/progression_diversity/mean": -0.12814107537269592, + "rewards/progression_diversity/std": 0.17228002846240997, + "rewards/symbolic_reward_accuracy/mean": 0.294921875, + "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, + "rewards/symbolic_reward_partial_score/mean": 0.5648274421691895, + "rewards/symbolic_reward_partial_score/std": 0.4122779667377472, + "rewards/tag_count_reward/mean": -0.349609375, + "rewards/tag_count_reward/std": 0.47731292247772217, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9831830263137817, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 440.0, + "sampling/sampling_logp_difference/mean": 13.701435089111328, + "step": 77 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.15608947724103928, + "epoch": 0.125, + "grad_norm": 502.0856628417969, + "learning_rate": 1e-06, + "loss": 0.2134, + "step": 78 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.1563669890165329, + "epoch": 0.1266025641025641, + "grad_norm": 40.49258804321289, + "learning_rate": 1e-06, + "loss": 0.2021, + "step": 79 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6640625, + "entropy": 0.13854920864105225, + "epoch": 0.1282051282051282, + "grad_norm": 24.90146255493164, + "learning_rate": 1e-06, + "loss": 0.3625, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.376953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8946.0, + "completions/mean_length": 7095.765625, + "completions/mean_terminated_length": 1476.2381591796875, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "entropy": 0.1516287624835968, + "epoch": 0.12980769230769232, + "frac_reward_zero_std": 0.0, + "grad_norm": 1240.288818359375, + "learning_rate": 1e-06, + "loss": 0.3117, + "num_tokens": 100263337.0, + "reward": 0.2255856692790985, + "reward_std": 0.19480521976947784, + "rewards/progression_diversity/mean": -0.12746772170066833, + "rewards/progression_diversity/std": 0.17732733488082886, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.5641438961029053, + "rewards/symbolic_reward_partial_score/std": 0.36627063155174255, + "rewards/tag_count_reward/mean": -0.349609375, + "rewards/tag_count_reward/std": 0.47731292247772217, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782713651657104, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 440.0, + "sampling/sampling_logp_difference/mean": 14.690707206726074, + "step": 81 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.17118209600448608, + "epoch": 0.13141025641025642, + "grad_norm": 143.5312042236328, + "learning_rate": 1e-06, + "loss": 0.2574, + "step": 82 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.14265629649162292, + "epoch": 0.1330128205128205, + "grad_norm": 2867.10009765625, + "learning_rate": 1e-06, + "loss": 0.4231, + "step": 83 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.15464875847101212, + "epoch": 0.1346153846153846, + "grad_norm": 54.51691436767578, + "learning_rate": 1e-06, + "loss": 0.2703, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.48046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3804.0, + "completions/mean_length": 8625.69921875, + "completions/mean_terminated_length": 1450.7293701171875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.13001390919089317, + "epoch": 0.1362179487179487, + "frac_reward_zero_std": 0.0, + "grad_norm": 455.9178161621094, + "learning_rate": 1e-06, + "loss": 0.222, + "num_tokens": 105565695.0, + "reward": 0.1887899488210678, + "reward_std": 0.2059934437274933, + "rewards/progression_diversity/mean": -0.16299909353256226, + "rewards/progression_diversity/std": 0.18564143776893616, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.47913411259651184, + "rewards/symbolic_reward_partial_score/std": 0.38510748744010925, + "rewards/tag_count_reward/mean": -0.447265625, + "rewards/tag_count_reward/std": 0.4976975917816162, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9722630977630615, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 442.0, + "sampling/sampling_logp_difference/mean": 15.671956062316895, + "step": 85 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.1201215460896492, + "epoch": 0.13782051282051283, + "grad_norm": 307.8591003417969, + "learning_rate": 1e-06, + "loss": 0.3693, + "step": 86 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.12688108533620834, + "epoch": 0.13942307692307693, + "grad_norm": 181.98202514648438, + "learning_rate": 1e-06, + "loss": 0.3682, + "step": 87 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.4140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.12507511675357819, + "epoch": 0.14102564102564102, + "grad_norm": 60.04415512084961, + "learning_rate": 1e-06, + "loss": 0.3179, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4394.0, + "completions/mean_length": 7166.556640625, + "completions/mean_terminated_length": 1449.4083251953125, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.16381006687879562, + "epoch": 0.14262820512820512, + "frac_reward_zero_std": 0.0, + "grad_norm": 344.05682373046875, + "learning_rate": 1e-06, + "loss": 0.2595, + "num_tokens": 109989372.0, + "reward": 0.28582632541656494, + "reward_std": 0.21509969234466553, + "rewards/progression_diversity/mean": -0.12635371088981628, + "rewards/progression_diversity/std": 0.17559568583965302, + "rewards/symbolic_reward_accuracy/mean": 0.255859375, + "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, + "rewards/symbolic_reward_partial_score/mean": 0.5617838501930237, + "rewards/symbolic_reward_partial_score/std": 0.3981553614139557, + "rewards/tag_count_reward/mean": -0.349609375, + "rewards/tag_count_reward/std": 0.47731292247772217, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799978733062744, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 444.0, + "sampling/sampling_logp_difference/mean": 14.372432708740234, + "step": 89 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.14315053820610046, + "epoch": 0.14423076923076922, + "grad_norm": 354.8428955078125, + "learning_rate": 1e-06, + "loss": 0.3071, + "step": 90 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.16275375336408615, + "epoch": 0.14583333333333334, + "grad_norm": 645.2577514648438, + "learning_rate": 1e-06, + "loss": 0.3115, + "step": 91 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.16187621653079987, + "epoch": 0.14743589743589744, + "grad_norm": 0.025196930393576622, + "learning_rate": 1e-06, + "loss": 0.2797, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.333984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4512.0, + "completions/mean_length": 6437.927734375, + "completions/mean_terminated_length": 1450.307861328125, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.18978476524353027, + "epoch": 0.14903846153846154, + "frac_reward_zero_std": 0.03125, + "grad_norm": 520.9977416992188, + "learning_rate": 1e-06, + "loss": 0.1873, + "num_tokens": 114084839.0, + "reward": 0.3511761724948883, + "reward_std": 0.209753155708313, + "rewards/progression_diversity/mean": -0.11236479878425598, + "rewards/progression_diversity/std": 0.17498160898685455, + "rewards/symbolic_reward_accuracy/mean": 0.326171875, + "rewards/symbolic_reward_accuracy/std": 0.4692695140838623, + "rewards/symbolic_reward_partial_score/mean": 0.6183431148529053, + "rewards/symbolic_reward_partial_score/std": 0.38944968581199646, + "rewards/tag_count_reward/mean": -0.2890625, + "rewards/tag_count_reward/std": 0.45377036929130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891281127929688, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 444.0, + "sampling/sampling_logp_difference/mean": 12.81410026550293, + "step": 93 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.16329990327358246, + "epoch": 0.15064102564102563, + "grad_norm": 115.51496887207031, + "learning_rate": 1e-06, + "loss": 0.305, + "step": 94 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.17047469317913055, + "epoch": 0.15224358974358973, + "grad_norm": 48.81769561767578, + "learning_rate": 1e-06, + "loss": 0.235, + "step": 95 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6328125, + "entropy": 0.16202182322740555, + "epoch": 0.15384615384615385, + "grad_norm": 21.413990020751953, + "learning_rate": 1e-06, + "loss": 0.2707, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.45703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4237.0, + "completions/mean_length": 8286.236328125, + "completions/mean_terminated_length": 1470.1331787109375, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.12126778811216354, + "epoch": 0.15544871794871795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1298.6553955078125, + "learning_rate": 1e-06, + "loss": 0.3518, + "num_tokens": 119384688.0, + "reward": 0.15234431624412537, + "reward_std": 0.18723168969154358, + "rewards/progression_diversity/mean": -0.1576594114303589, + "rewards/progression_diversity/std": 0.18727099895477295, + "rewards/symbolic_reward_accuracy/mean": 0.099609375, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.44340819120407104, + "rewards/symbolic_reward_partial_score/std": 0.3761260211467743, + "rewards/tag_count_reward/mean": -0.388671875, + "rewards/tag_count_reward/std": 0.4879252314567566, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9738816022872925, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 446.0, + "sampling/sampling_logp_difference/mean": 15.43978500366211, + "step": 97 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.1406988874077797, + "epoch": 0.15705128205128205, + "grad_norm": 550.3086547851562, + "learning_rate": 1e-06, + "loss": 0.2552, + "step": 98 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6328125, + "entropy": 0.13929974660277367, + "epoch": 0.15865384615384615, + "grad_norm": 161.8490447998047, + "learning_rate": 1e-06, + "loss": 0.2264, + "step": 99 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.4609375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.71875, + "entropy": 0.11794158071279526, + "epoch": 0.16025641025641027, + "grad_norm": 22.501441955566406, + "learning_rate": 1e-06, + "loss": 0.4061, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.392578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8333.0, + "completions/mean_length": 7378.84375, + "completions/mean_terminated_length": 1558.791015625, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.14221202209591866, + "epoch": 0.16185897435897437, + "frac_reward_zero_std": 0.0, + "grad_norm": 3660.10693359375, + "learning_rate": 1e-06, + "loss": 0.409, + "num_tokens": 124050112.0, + "reward": 0.18008412420749664, + "reward_std": 0.16677263379096985, + "rewards/progression_diversity/mean": -0.14051449298858643, + "rewards/progression_diversity/std": 0.18938302993774414, + "rewards/symbolic_reward_accuracy/mean": 0.115234375, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.4884277284145355, + "rewards/symbolic_reward_partial_score/std": 0.37571844458580017, + "rewards/tag_count_reward/mean": -0.341796875, + "rewards/tag_count_reward/std": 0.4747757613658905, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.976874589920044, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 448.0, + "sampling/sampling_logp_difference/mean": 15.273395538330078, + "step": 101 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.1293311044573784, + "epoch": 0.16346153846153846, + "grad_norm": 400.09881591796875, + "learning_rate": 1e-06, + "loss": 0.3765, + "step": 102 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.1685006469488144, + "epoch": 0.16506410256410256, + "grad_norm": 77.7186050415039, + "learning_rate": 1e-06, + "loss": 0.2434, + "step": 103 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6796875, + "entropy": 0.16083669662475586, + "epoch": 0.16666666666666666, + "grad_norm": 67.05364990234375, + "learning_rate": 1e-06, + "loss": 0.2542, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.330078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8690.0, + "completions/mean_length": 6384.462890625, + "completions/mean_terminated_length": 1457.5772705078125, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "entropy": 0.17285646498203278, + "epoch": 0.16826923076923078, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1558.9949951171875, + "learning_rate": 1e-06, + "loss": 0.1558, + "num_tokens": 128130573.0, + "reward": 0.2611575722694397, + "reward_std": 0.21417993307113647, + "rewards/progression_diversity/mean": -0.116178959608078, + "rewards/progression_diversity/std": 0.17685534060001373, + "rewards/symbolic_reward_accuracy/mean": 0.197265625, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.5794758796691895, + "rewards/symbolic_reward_partial_score/std": 0.3689756989479065, + "rewards/tag_count_reward/mean": -0.298828125, + "rewards/tag_count_reward/std": 0.45819199085235596, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9883815050125122, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 452.0, + "sampling/sampling_logp_difference/mean": 13.416872024536133, + "step": 105 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.1663920134305954, + "epoch": 0.16987179487179488, + "grad_norm": 319.74737548828125, + "learning_rate": 1e-06, + "loss": 0.2595, + "step": 106 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.18017150461673737, + "epoch": 0.17147435897435898, + "grad_norm": 447.3372497558594, + "learning_rate": 1e-06, + "loss": 0.313, + "step": 107 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.182367742061615, + "epoch": 0.17307692307692307, + "grad_norm": 80.31652069091797, + "learning_rate": 1e-06, + "loss": 0.2265, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.400390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5179.0, + "completions/mean_length": 7449.263671875, + "completions/mean_terminated_length": 1483.0716552734375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.1524367406964302, + "epoch": 0.17467948717948717, + "frac_reward_zero_std": 0.03125, + "grad_norm": 3725.33349609375, + "learning_rate": 1e-06, + "loss": 0.3077, + "num_tokens": 132834852.0, + "reward": 0.24695852398872375, + "reward_std": 0.17919716238975525, + "rewards/progression_diversity/mean": -0.14106225967407227, + "rewards/progression_diversity/std": 0.19025002419948578, + "rewards/symbolic_reward_accuracy/mean": 0.20703125, + "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, + "rewards/symbolic_reward_partial_score/mean": 0.5316731333732605, + "rewards/symbolic_reward_partial_score/std": 0.3761518597602844, + "rewards/tag_count_reward/mean": -0.353515625, + "rewards/tag_count_reward/std": 0.47852855920791626, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842573404312134, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 456.0, + "sampling/sampling_logp_difference/mean": 14.218891143798828, + "step": 109 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.14695103839039803, + "epoch": 0.1762820512820513, + "grad_norm": 219.70684814453125, + "learning_rate": 1e-06, + "loss": 0.2567, + "step": 110 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.16737890988588333, + "epoch": 0.1778846153846154, + "grad_norm": 367.99285888671875, + "learning_rate": 1e-06, + "loss": 0.2145, + "step": 111 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.1623668521642685, + "epoch": 0.1794871794871795, + "grad_norm": 359.1845703125, + "learning_rate": 1e-06, + "loss": 0.3114, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.298828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5083.0, + "completions/mean_length": 5895.5625, + "completions/mean_terminated_length": 1425.559814453125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.17023801803588867, + "epoch": 0.18108974358974358, + "frac_reward_zero_std": 0.03125, + "grad_norm": 908.6353149414062, + "learning_rate": 1e-06, + "loss": 0.2684, + "num_tokens": 136701844.0, + "reward": 0.3186452388763428, + "reward_std": 0.20103424787521362, + "rewards/progression_diversity/mean": -0.1164332926273346, + "rewards/progression_diversity/std": 0.18981561064720154, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.5979329347610474, + "rewards/symbolic_reward_partial_score/std": 0.38331446051597595, + "rewards/tag_count_reward/mean": -0.259765625, + "rewards/tag_count_reward/std": 0.4389347732067108, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9887030124664307, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 456.0, + "sampling/sampling_logp_difference/mean": 13.705371856689453, + "step": 113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.19520440697669983, + "epoch": 0.18269230769230768, + "grad_norm": 1410.1068115234375, + "learning_rate": 1e-06, + "loss": 0.2648, + "step": 114 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.20632527768611908, + "epoch": 0.1842948717948718, + "grad_norm": 189.16241455078125, + "learning_rate": 1e-06, + "loss": 0.2618, + "step": 115 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.18539946526288986, + "epoch": 0.1858974358974359, + "grad_norm": 1.3296606540679932, + "learning_rate": 1e-06, + "loss": 0.2421, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13409.0, + "completions/mean_length": 6352.482421875, + "completions/mean_terminated_length": 1453.369140625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "entropy": 0.17278361320495605, + "epoch": 0.1875, + "frac_reward_zero_std": 0.03125, + "grad_norm": 269.99188232421875, + "learning_rate": 1e-06, + "loss": 0.2609, + "num_tokens": 140781531.0, + "reward": 0.28726375102996826, + "reward_std": 0.20572300255298615, + "rewards/progression_diversity/mean": -0.1266537606716156, + "rewards/progression_diversity/std": 0.19276343286037445, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.5861165523529053, + "rewards/symbolic_reward_partial_score/std": 0.37256380915641785, + "rewards/tag_count_reward/mean": -0.291015625, + "rewards/tag_count_reward/std": 0.45467492938041687, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9863637089729309, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 460.0, + "sampling/sampling_logp_difference/mean": 14.210054397583008, + "step": 117 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.1698644682765007, + "epoch": 0.1891025641025641, + "grad_norm": 174.22406005859375, + "learning_rate": 1e-06, + "loss": 0.2695, + "step": 118 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.17513630539178848, + "epoch": 0.1907051282051282, + "grad_norm": 80.73851013183594, + "learning_rate": 1e-06, + "loss": 0.2233, + "step": 119 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.1907574087381363, + "epoch": 0.19230769230769232, + "grad_norm": 10.501818656921387, + "learning_rate": 1e-06, + "loss": 0.2534, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12543.0, + "completions/mean_length": 5693.0859375, + "completions/mean_terminated_length": 1428.404296875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.20927876979112625, + "epoch": 0.19391025641025642, + "frac_reward_zero_std": 0.0, + "grad_norm": 1835.8458251953125, + "learning_rate": 1e-06, + "loss": 0.2083, + "num_tokens": 144458791.0, + "reward": 0.3568437099456787, + "reward_std": 0.23056712746620178, + "rewards/progression_diversity/mean": -0.11690312623977661, + "rewards/progression_diversity/std": 0.19476282596588135, + "rewards/symbolic_reward_accuracy/mean": 0.326171875, + "rewards/symbolic_reward_accuracy/std": 0.4692695140838623, + "rewards/symbolic_reward_partial_score/mean": 0.6289225220680237, + "rewards/symbolic_reward_partial_score/std": 0.3799564838409424, + "rewards/tag_count_reward/mean": -0.263671875, + "rewards/tag_count_reward/std": 0.4410543739795685, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867503643035889, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 462.0, + "sampling/sampling_logp_difference/mean": 14.30959415435791, + "step": 121 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.19147824496030807, + "epoch": 0.1955128205128205, + "grad_norm": 149.43600463867188, + "learning_rate": 1e-06, + "loss": 0.2888, + "step": 122 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.19553864002227783, + "epoch": 0.1971153846153846, + "grad_norm": 161.30584716796875, + "learning_rate": 1e-06, + "loss": 0.2042, + "step": 123 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.1698673516511917, + "epoch": 0.1987179487179487, + "grad_norm": 62.17693328857422, + "learning_rate": 1e-06, + "loss": 0.3143, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.349609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5310.0, + "completions/mean_length": 6637.630859375, + "completions/mean_terminated_length": 1398.5916748046875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.15772393345832825, + "epoch": 0.20032051282051283, + "frac_reward_zero_std": 0.0, + "grad_norm": 361.6252136230469, + "learning_rate": 1e-06, + "loss": 0.2752, + "num_tokens": 148787658.0, + "reward": 0.2472916841506958, + "reward_std": 0.20401528477668762, + "rewards/progression_diversity/mean": -0.13557906448841095, + "rewards/progression_diversity/std": 0.1981465220451355, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.5566895008087158, + "rewards/symbolic_reward_partial_score/std": 0.3679920732975006, + "rewards/tag_count_reward/mean": -0.30859375, + "rewards/tag_count_reward/std": 0.4623647928237915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800022840499878, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 462.0, + "sampling/sampling_logp_difference/mean": 15.337078094482422, + "step": 125 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.17300068587064743, + "epoch": 0.20192307692307693, + "grad_norm": 195.64173889160156, + "learning_rate": 1e-06, + "loss": 0.2278, + "step": 126 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.14945074170827866, + "epoch": 0.20352564102564102, + "grad_norm": 388.9164733886719, + "learning_rate": 1e-06, + "loss": 0.3515, + "step": 127 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.3515625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.640625, + "entropy": 0.1672022044658661, + "epoch": 0.20512820512820512, + "grad_norm": 10.722322463989258, + "learning_rate": 1e-06, + "loss": 0.2494, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.341796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4674.0, + "completions/mean_length": 6536.28125, + "completions/mean_terminated_length": 1422.480712890625, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "entropy": 0.16759486496448517, + "epoch": 0.20673076923076922, + "frac_reward_zero_std": 0.03125, + "grad_norm": 282.8157043457031, + "learning_rate": 1e-06, + "loss": 0.2464, + "num_tokens": 153045402.0, + "reward": 0.29630646109580994, + "reward_std": 0.23296579718589783, + "rewards/progression_diversity/mean": -0.1359575092792511, + "rewards/progression_diversity/std": 0.20307843387126923, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.555371105670929, + "rewards/symbolic_reward_partial_score/std": 0.40097343921661377, + "rewards/tag_count_reward/mean": -0.283203125, + "rewards/tag_count_reward/std": 0.4509948492050171, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9835497140884399, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 464.0, + "sampling/sampling_logp_difference/mean": 14.896586418151855, + "step": 129 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6328125, + "entropy": 0.1613958775997162, + "epoch": 0.20833333333333334, + "grad_norm": 273.4793701171875, + "learning_rate": 1e-06, + "loss": 0.2878, + "step": 130 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.17489013820886612, + "epoch": 0.20993589743589744, + "grad_norm": 54.437557220458984, + "learning_rate": 1e-06, + "loss": 0.2721, + "step": 131 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.16963838040828705, + "epoch": 0.21153846153846154, + "grad_norm": 174.172119140625, + "learning_rate": 1e-06, + "loss": 0.2341, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.365234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14442.0, + "completions/mean_length": 7063.861328125, + "completions/mean_terminated_length": 1701.1968994140625, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.14898524433374405, + "epoch": 0.21314102564102563, + "frac_reward_zero_std": 0.0, + "grad_norm": 1326.01611328125, + "learning_rate": 1e-06, + "loss": 0.3208, + "num_tokens": 157607635.0, + "reward": 0.22771768271923065, + "reward_std": 0.22894077003002167, + "rewards/progression_diversity/mean": -0.14180660247802734, + "rewards/progression_diversity/std": 0.19994327425956726, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.5255045294761658, + "rewards/symbolic_reward_partial_score/std": 0.3672705292701721, + "rewards/tag_count_reward/mean": -0.328125, + "rewards/tag_count_reward/std": 0.4699897766113281, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9785855412483215, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 464.0, + "sampling/sampling_logp_difference/mean": 15.50029182434082, + "step": 133 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.14145556092262268, + "epoch": 0.21474358974358973, + "grad_norm": 567.6675415039062, + "learning_rate": 1e-06, + "loss": 0.3571, + "step": 134 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.15257297456264496, + "epoch": 0.21634615384615385, + "grad_norm": 3.565154552459717, + "learning_rate": 1e-06, + "loss": 0.2647, + "step": 135 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.16691310703754425, + "epoch": 0.21794871794871795, + "grad_norm": 91.63186645507812, + "learning_rate": 1e-06, + "loss": 0.2332, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.349609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3840.0, + "completions/mean_length": 6634.4609375, + "completions/mean_terminated_length": 1393.7177734375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.16496124863624573, + "epoch": 0.21955128205128205, + "frac_reward_zero_std": 0.03125, + "grad_norm": 526.1981201171875, + "learning_rate": 1e-06, + "loss": 0.289, + "num_tokens": 161836447.0, + "reward": 0.20310071110725403, + "reward_std": 0.19233137369155884, + "rewards/progression_diversity/mean": -0.14451980590820312, + "rewards/progression_diversity/std": 0.2100125253200531, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.5359863042831421, + "rewards/symbolic_reward_partial_score/std": 0.3669736087322235, + "rewards/tag_count_reward/mean": -0.3125, + "rewards/tag_count_reward/std": 0.4639657139778137, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9792190790176392, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 468.0, + "sampling/sampling_logp_difference/mean": 15.816644668579102, + "step": 137 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.16982516646385193, + "epoch": 0.22115384615384615, + "grad_norm": 759.0693359375, + "learning_rate": 1e-06, + "loss": 0.4166, + "step": 138 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6015625, + "entropy": 0.16247816383838654, + "epoch": 0.22275641025641027, + "grad_norm": 522.9348754882812, + "learning_rate": 1e-06, + "loss": 0.2726, + "step": 139 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.640625, + "entropy": 0.1550752818584442, + "epoch": 0.22435897435897437, + "grad_norm": 4.066892147064209, + "learning_rate": 1e-06, + "loss": 0.3117, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.29296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7450.0, + "completions/mean_length": 5863.76171875, + "completions/mean_terminated_length": 1504.5469970703125, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.1805713176727295, + "epoch": 0.22596153846153846, + "frac_reward_zero_std": 0.03125, + "grad_norm": 949.1548461914062, + "learning_rate": 1e-06, + "loss": 0.2557, + "num_tokens": 165682309.0, + "reward": 0.20798048377037048, + "reward_std": 0.1601024717092514, + "rewards/progression_diversity/mean": -0.11650343239307404, + "rewards/progression_diversity/std": 0.19056853652000427, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.5506673455238342, + "rewards/symbolic_reward_partial_score/std": 0.35328954458236694, + "rewards/tag_count_reward/mean": -0.263671875, + "rewards/tag_count_reward/std": 0.4410543739795685, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906647205352783, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 468.0, + "sampling/sampling_logp_difference/mean": 13.65820598602295, + "step": 141 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.19785359501838684, + "epoch": 0.22756410256410256, + "grad_norm": 413.5582580566406, + "learning_rate": 1e-06, + "loss": 0.3065, + "step": 142 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.625, + "entropy": 0.1836673691868782, + "epoch": 0.22916666666666666, + "grad_norm": 538.4183959960938, + "learning_rate": 1e-06, + "loss": 0.2451, + "step": 143 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.19951891899108887, + "epoch": 0.23076923076923078, + "grad_norm": 140.01963806152344, + "learning_rate": 1e-06, + "loss": 0.2766, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13188.0, + "completions/mean_length": 6076.03515625, + "completions/mean_terminated_length": 1559.061767578125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.18072357773780823, + "epoch": 0.23237179487179488, + "frac_reward_zero_std": 0.0625, + "grad_norm": 477.57244873046875, + "learning_rate": 1e-06, + "loss": 0.2059, + "num_tokens": 169643863.0, + "reward": 0.2985851764678955, + "reward_std": 0.21802590787410736, + "rewards/progression_diversity/mean": -0.12000055611133575, + "rewards/progression_diversity/std": 0.19343887269496918, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.5884765386581421, + "rewards/symbolic_reward_partial_score/std": 0.3711862862110138, + "rewards/tag_count_reward/mean": -0.255859375, + "rewards/tag_count_reward/std": 0.43676990270614624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889644384384155, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 468.0, + "sampling/sampling_logp_difference/mean": 13.810128211975098, + "step": 145 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.16814905405044556, + "epoch": 0.23397435897435898, + "grad_norm": 629.6782836914062, + "learning_rate": 1e-06, + "loss": 0.3065, + "step": 146 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.17732855677604675, + "epoch": 0.23557692307692307, + "grad_norm": 60.79619216918945, + "learning_rate": 1e-06, + "loss": 0.2682, + "step": 147 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.18027547001838684, + "epoch": 0.23717948717948717, + "grad_norm": 11.454753875732422, + "learning_rate": 1e-06, + "loss": 0.2156, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.291015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8313.0, + "completions/mean_length": 5886.9921875, + "completions/mean_terminated_length": 1578.3031005859375, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.20559810101985931, + "epoch": 0.2387820512820513, + "frac_reward_zero_std": 0.0, + "grad_norm": 932.6332397460938, + "learning_rate": 1e-06, + "loss": 0.1883, + "num_tokens": 173516387.0, + "reward": 0.26607826352119446, + "reward_std": 0.2007283866405487, + "rewards/progression_diversity/mean": -0.11971336603164673, + "rewards/progression_diversity/std": 0.19882185757160187, + "rewards/symbolic_reward_accuracy/mean": 0.201171875, + "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, + "rewards/symbolic_reward_partial_score/mean": 0.5738606452941895, + "rewards/symbolic_reward_partial_score/std": 0.3670341968536377, + "rewards/tag_count_reward/mean": -0.255859375, + "rewards/tag_count_reward/std": 0.43676990270614624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9897212386131287, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 470.0, + "sampling/sampling_logp_difference/mean": 13.849344253540039, + "step": 149 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.18233749270439148, + "epoch": 0.2403846153846154, + "grad_norm": 219.96221923828125, + "learning_rate": 1e-06, + "loss": 0.3033, + "step": 150 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.17845363169908524, + "epoch": 0.2419871794871795, + "grad_norm": 243.94744873046875, + "learning_rate": 1e-06, + "loss": 0.2881, + "step": 151 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.17516274005174637, + "epoch": 0.24358974358974358, + "grad_norm": 35.456787109375, + "learning_rate": 1e-06, + "loss": 0.2953, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.302734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5891.0, + "completions/mean_length": 5964.396484375, + "completions/mean_terminated_length": 1440.47900390625, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.17081477493047714, + "epoch": 0.24519230769230768, + "frac_reward_zero_std": 0.0625, + "grad_norm": 492.8033752441406, + "learning_rate": 1e-06, + "loss": 0.3071, + "num_tokens": 177494126.0, + "reward": 0.2438780814409256, + "reward_std": 0.16874653100967407, + "rewards/progression_diversity/mean": -0.12586447596549988, + "rewards/progression_diversity/std": 0.202118381857872, + "rewards/symbolic_reward_accuracy/mean": 0.169921875, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.5612630844116211, + "rewards/symbolic_reward_partial_score/std": 0.35993054509162903, + "rewards/tag_count_reward/mean": -0.251953125, + "rewards/tag_count_reward/std": 0.43455907702445984, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9910095930099487, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 472.0, + "sampling/sampling_logp_difference/mean": 13.843090057373047, + "step": 153 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.19071951508522034, + "epoch": 0.2467948717948718, + "grad_norm": 417.1937561035156, + "learning_rate": 1e-06, + "loss": 0.2772, + "step": 154 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.18628785014152527, + "epoch": 0.2483974358974359, + "grad_norm": 27.67140007019043, + "learning_rate": 1e-06, + "loss": 0.2686, + "step": 155 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2085033506155014, + "epoch": 0.25, + "grad_norm": 0.0168951116502285, + "learning_rate": 1e-06, + "loss": 0.1791, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.255859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5164.0, + "completions/mean_length": 5305.685546875, + "completions/mean_terminated_length": 1496.6063232421875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.18942154943943024, + "epoch": 0.2516025641025641, + "frac_reward_zero_std": 0.03125, + "grad_norm": 547.8964233398438, + "learning_rate": 1e-06, + "loss": 0.2312, + "num_tokens": 181116349.0, + "reward": 0.27441728115081787, + "reward_std": 0.20713432133197784, + "rewards/progression_diversity/mean": -0.10222004354000092, + "rewards/progression_diversity/std": 0.18552517890930176, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.6082357168197632, + "rewards/symbolic_reward_partial_score/std": 0.34247255325317383, + "rewards/tag_count_reward/mean": -0.20703125, + "rewards/tag_count_reward/std": 0.40557438135147095, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906232357025146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 472.0, + "sampling/sampling_logp_difference/mean": 13.893712043762207, + "step": 157 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.20169401168823242, + "epoch": 0.2532051282051282, + "grad_norm": 378.7434997558594, + "learning_rate": 1e-06, + "loss": 0.3347, + "step": 158 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.20617859065532684, + "epoch": 0.2548076923076923, + "grad_norm": 51.51784133911133, + "learning_rate": 1e-06, + "loss": 0.2684, + "step": 159 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.20408597588539124, + "epoch": 0.2564102564102564, + "grad_norm": 19.432321548461914, + "learning_rate": 1e-06, + "loss": 0.1783, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.306640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6080.0, + "completions/mean_length": 6120.22265625, + "completions/mean_terminated_length": 1581.0308837890625, + "completions/min_length": 403.0, + "completions/min_terminated_length": 403.0, + "entropy": 0.17949867248535156, + "epoch": 0.25801282051282054, + "frac_reward_zero_std": 0.0, + "grad_norm": 459.5679626464844, + "learning_rate": 1e-06, + "loss": 0.2659, + "num_tokens": 185044895.0, + "reward": 0.27867066860198975, + "reward_std": 0.21932919323444366, + "rewards/progression_diversity/mean": -0.12316776067018509, + "rewards/progression_diversity/std": 0.19920648634433746, + "rewards/symbolic_reward_accuracy/mean": 0.2265625, + "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, + "rewards/symbolic_reward_partial_score/mean": 0.5664713382720947, + "rewards/symbolic_reward_partial_score/std": 0.3668426275253296, + "rewards/tag_count_reward/mean": -0.259765625, + "rewards/tag_count_reward/std": 0.4389347732067108, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9886577129364014, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 476.0, + "sampling/sampling_logp_difference/mean": 14.280139923095703, + "step": 161 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.17472190409898758, + "epoch": 0.25961538461538464, + "grad_norm": 1472.97216796875, + "learning_rate": 1e-06, + "loss": 0.2911, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.17166371643543243, + "epoch": 0.26121794871794873, + "grad_norm": 229.58348083496094, + "learning_rate": 1e-06, + "loss": 0.2621, + "step": 163 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.20802847295999527, + "epoch": 0.26282051282051283, + "grad_norm": 183.23561096191406, + "learning_rate": 1e-06, + "loss": 0.2019, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.208984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6132.0, + "completions/mean_length": 4608.12890625, + "completions/mean_terminated_length": 1496.972900390625, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "entropy": 0.20654169470071793, + "epoch": 0.2644230769230769, + "frac_reward_zero_std": 0.0, + "grad_norm": 412.5071716308594, + "learning_rate": 1e-06, + "loss": 0.2796, + "num_tokens": 188104865.0, + "reward": 0.3180442452430725, + "reward_std": 0.21282601356506348, + "rewards/progression_diversity/mean": -0.08571553975343704, + "rewards/progression_diversity/std": 0.17538563907146454, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.6339681148529053, + "rewards/symbolic_reward_partial_score/std": 0.34711363911628723, + "rewards/tag_count_reward/mean": -0.189453125, + "rewards/tag_count_reward/std": 0.3922513723373413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9954339265823364, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 476.0, + "sampling/sampling_logp_difference/mean": 13.250750541687012, + "step": 165 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2085501253604889, + "epoch": 0.266025641025641, + "grad_norm": 412.70355224609375, + "learning_rate": 1e-06, + "loss": 0.2375, + "step": 166 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.21886557340621948, + "epoch": 0.2676282051282051, + "grad_norm": 369.0177307128906, + "learning_rate": 1e-06, + "loss": 0.1979, + "step": 167 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2201170101761818, + "epoch": 0.2692307692307692, + "grad_norm": 0.01998511515557766, + "learning_rate": 1e-06, + "loss": 0.2208, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.24609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4548.0, + "completions/mean_length": 5164.029296875, + "completions/mean_terminated_length": 1501.5517578125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "entropy": 0.19382669776678085, + "epoch": 0.2708333333333333, + "frac_reward_zero_std": 0.0, + "grad_norm": 395.1278991699219, + "learning_rate": 1e-06, + "loss": 0.2256, + "num_tokens": 191630160.0, + "reward": 0.29971635341644287, + "reward_std": 0.2082447111606598, + "rewards/progression_diversity/mean": -0.09819044172763824, + "rewards/progression_diversity/std": 0.18440598249435425, + "rewards/symbolic_reward_accuracy/mean": 0.228515625, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.614306628704071, + "rewards/symbolic_reward_partial_score/std": 0.3599141538143158, + "rewards/tag_count_reward/mean": -0.20703125, + "rewards/tag_count_reward/std": 0.40557438135147095, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9958318471908569, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 480.0, + "sampling/sampling_logp_difference/mean": 13.064840316772461, + "step": 169 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.18345515429973602, + "epoch": 0.2724358974358974, + "grad_norm": 296.6773986816406, + "learning_rate": 1e-06, + "loss": 0.2652, + "step": 170 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.65625, + "entropy": 0.2129678875207901, + "epoch": 0.27403846153846156, + "grad_norm": 5.62185001373291, + "learning_rate": 1e-06, + "loss": 0.2223, + "step": 171 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.2066056728363037, + "epoch": 0.27564102564102566, + "grad_norm": 7.7060699462890625, + "learning_rate": 1e-06, + "loss": 0.225, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.240234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4527.0, + "completions/mean_length": 5134.7578125, + "completions/mean_terminated_length": 1577.7994384765625, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "entropy": 0.18826917558908463, + "epoch": 0.27724358974358976, + "frac_reward_zero_std": 0.03125, + "grad_norm": 683.8331298828125, + "learning_rate": 1e-06, + "loss": 0.2989, + "num_tokens": 195087860.0, + "reward": 0.2965736985206604, + "reward_std": 0.2112705409526825, + "rewards/progression_diversity/mean": -0.09214277565479279, + "rewards/progression_diversity/std": 0.17536333203315735, + "rewards/symbolic_reward_accuracy/mean": 0.224609375, + "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, + "rewards/symbolic_reward_partial_score/mean": 0.6068847179412842, + "rewards/symbolic_reward_partial_score/std": 0.3553755581378937, + "rewards/tag_count_reward/mean": -0.193359375, + "rewards/tag_count_reward/std": 0.39531853795051575, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0022835731506348, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 480.0, + "sampling/sampling_logp_difference/mean": 11.84273910522461, + "step": 173 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.20123252272605896, + "epoch": 0.27884615384615385, + "grad_norm": 212.23048400878906, + "learning_rate": 1e-06, + "loss": 0.2336, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.20349763333797455, + "epoch": 0.28044871794871795, + "grad_norm": 150.0238037109375, + "learning_rate": 1e-06, + "loss": 0.2346, + "step": 175 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.22359148412942886, + "epoch": 0.28205128205128205, + "grad_norm": 88.31288146972656, + "learning_rate": 1e-06, + "loss": 0.1458, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12719.0, + "completions/mean_length": 5528.5, + "completions/mean_terminated_length": 1602.04248046875, + "completions/min_length": 312.0, + "completions/min_terminated_length": 312.0, + "entropy": 0.19052383303642273, + "epoch": 0.28365384615384615, + "frac_reward_zero_std": 0.0, + "grad_norm": 955.2317504882812, + "learning_rate": 1e-06, + "loss": 0.241, + "num_tokens": 198768004.0, + "reward": 0.2399653196334839, + "reward_std": 0.19414269924163818, + "rewards/progression_diversity/mean": -0.10552017390727997, + "rewards/progression_diversity/std": 0.18494197726249695, + "rewards/symbolic_reward_accuracy/mean": 0.16015625, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.5618652105331421, + "rewards/symbolic_reward_partial_score/std": 0.350735604763031, + "rewards/tag_count_reward/mean": -0.236328125, + "rewards/tag_count_reward/std": 0.42524150013923645, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989559531211853, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 482.0, + "sampling/sampling_logp_difference/mean": 14.195697784423828, + "step": 177 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.17670485377311707, + "epoch": 0.28525641025641024, + "grad_norm": 969.2748413085938, + "learning_rate": 1e-06, + "loss": 0.3582, + "step": 178 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.18852558732032776, + "epoch": 0.28685897435897434, + "grad_norm": 0.01699661649763584, + "learning_rate": 1e-06, + "loss": 0.2577, + "step": 179 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6015625, + "entropy": 0.19296500831842422, + "epoch": 0.28846153846153844, + "grad_norm": 0.020286090672016144, + "learning_rate": 1e-06, + "loss": 0.2662, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5431.0, + "completions/mean_length": 5255.9375, + "completions/mean_terminated_length": 1546.5833740234375, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.1944911777973175, + "epoch": 0.2900641025641026, + "frac_reward_zero_std": 0.03125, + "grad_norm": 478.534423828125, + "learning_rate": 1e-06, + "loss": 0.2062, + "num_tokens": 202400020.0, + "reward": 0.23154094815254211, + "reward_std": 0.18954092264175415, + "rewards/progression_diversity/mean": -0.09737183153629303, + "rewards/progression_diversity/std": 0.1798097938299179, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.5712727904319763, + "rewards/symbolic_reward_partial_score/std": 0.3447929322719574, + "rewards/tag_count_reward/mean": -0.197265625, + "rewards/tag_count_reward/std": 0.3983237147331238, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9921056032180786, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 482.0, + "sampling/sampling_logp_difference/mean": 13.62202262878418, + "step": 181 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.20868180692195892, + "epoch": 0.2916666666666667, + "grad_norm": 51.3494758605957, + "learning_rate": 1e-06, + "loss": 0.2408, + "step": 182 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.18569938838481903, + "epoch": 0.2932692307692308, + "grad_norm": 55.675235748291016, + "learning_rate": 1e-06, + "loss": 0.268, + "step": 183 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.1851130947470665, + "epoch": 0.2948717948717949, + "grad_norm": 65.24762725830078, + "learning_rate": 1e-06, + "loss": 0.3019, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5164.0, + "completions/mean_length": 4260.142578125, + "completions/mean_terminated_length": 1462.329345703125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.2189076542854309, + "epoch": 0.296474358974359, + "frac_reward_zero_std": 0.09375, + "grad_norm": 344.04833984375, + "learning_rate": 1e-06, + "loss": 0.2133, + "num_tokens": 205497869.0, + "reward": 0.31361886858940125, + "reward_std": 0.19572681188583374, + "rewards/progression_diversity/mean": -0.07756727933883667, + "rewards/progression_diversity/std": 0.17236009240150452, + "rewards/symbolic_reward_accuracy/mean": 0.234375, + "rewards/symbolic_reward_accuracy/std": 0.42402184009552, + "rewards/symbolic_reward_partial_score/mean": 0.6332682371139526, + "rewards/symbolic_reward_partial_score/std": 0.3539099097251892, + "rewards/tag_count_reward/mean": -0.162109375, + "rewards/tag_count_reward/std": 0.3689115643501282, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0022011995315552, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 484.0, + "sampling/sampling_logp_difference/mean": 12.16126823425293, + "step": 185 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.22352929413318634, + "epoch": 0.2980769230769231, + "grad_norm": 37.45823669433594, + "learning_rate": 1e-06, + "loss": 0.2166, + "step": 186 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2236320823431015, + "epoch": 0.29967948717948717, + "grad_norm": 4.9470672607421875, + "learning_rate": 1e-06, + "loss": 0.1972, + "step": 187 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.22875476628541946, + "epoch": 0.30128205128205127, + "grad_norm": 0.024561388418078423, + "learning_rate": 1e-06, + "loss": 0.2051, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4226.0, + "completions/mean_length": 3966.052734375, + "completions/mean_terminated_length": 1388.742919921875, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.22446440160274506, + "epoch": 0.30288461538461536, + "frac_reward_zero_std": 0.0, + "grad_norm": 2142.739013671875, + "learning_rate": 1e-06, + "loss": 0.2362, + "num_tokens": 208400968.0, + "reward": 0.2870832681655884, + "reward_std": 0.18633730709552765, + "rewards/progression_diversity/mean": -0.07292823493480682, + "rewards/progression_diversity/std": 0.16945891082286835, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.6227864623069763, + "rewards/symbolic_reward_partial_score/std": 0.33123111724853516, + "rewards/tag_count_reward/mean": -0.138671875, + "rewards/tag_count_reward/std": 0.34594178199768066, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0030953884124756, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 484.0, + "sampling/sampling_logp_difference/mean": 12.393356323242188, + "step": 189 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.22635683417320251, + "epoch": 0.30448717948717946, + "grad_norm": 370.2960205078125, + "learning_rate": 1e-06, + "loss": 0.2681, + "step": 190 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2461889684200287, + "epoch": 0.3060897435897436, + "grad_norm": 0.01940600946545601, + "learning_rate": 1e-06, + "loss": 0.1929, + "step": 191 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.24533099681138992, + "epoch": 0.3076923076923077, + "grad_norm": 0.031975965946912766, + "learning_rate": 1e-06, + "loss": 0.1993, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5819.0, + "completions/mean_length": 5224.373046875, + "completions/mean_terminated_length": 1426.59423828125, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.1923881694674492, + "epoch": 0.3092948717948718, + "frac_reward_zero_std": 0.0, + "grad_norm": 366.1446838378906, + "learning_rate": 1e-06, + "loss": 0.3055, + "num_tokens": 212054839.0, + "reward": 0.2073436677455902, + "reward_std": 0.1643807291984558, + "rewards/progression_diversity/mean": -0.1074308454990387, + "rewards/progression_diversity/std": 0.19538284838199615, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.5267577767372131, + "rewards/symbolic_reward_partial_score/std": 0.3376568853855133, + "rewards/tag_count_reward/mean": -0.19921875, + "rewards/tag_count_reward/std": 0.39980348944664, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920334815979004, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 488.0, + "sampling/sampling_logp_difference/mean": 14.228421211242676, + "step": 193 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.2093525379896164, + "epoch": 0.3108974358974359, + "grad_norm": 143.63833618164062, + "learning_rate": 1e-06, + "loss": 0.22, + "step": 194 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.20566636323928833, + "epoch": 0.3125, + "grad_norm": 507.2211608886719, + "learning_rate": 1e-06, + "loss": 0.2588, + "step": 195 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.21372250467538834, + "epoch": 0.3141025641025641, + "grad_norm": 837.7913818359375, + "learning_rate": 1e-06, + "loss": 0.2089, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.22265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4510.0, + "completions/mean_length": 4631.9921875, + "completions/mean_terminated_length": 1265.8392333984375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.22118443250656128, + "epoch": 0.3157051282051282, + "frac_reward_zero_std": 0.03125, + "grad_norm": 653.4398193359375, + "learning_rate": 1e-06, + "loss": 0.2032, + "num_tokens": 215352051.0, + "reward": 0.25279223918914795, + "reward_std": 0.19195367395877838, + "rewards/progression_diversity/mean": -0.10407879203557968, + "rewards/progression_diversity/std": 0.20429863035678864, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.5993651747703552, + "rewards/symbolic_reward_partial_score/std": 0.34102150797843933, + "rewards/tag_count_reward/mean": -0.173828125, + "rewards/tag_count_reward/std": 0.3793322443962097, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9943366050720215, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 488.0, + "sampling/sampling_logp_difference/mean": 14.024613380432129, + "step": 197 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.21031095832586288, + "epoch": 0.3173076923076923, + "grad_norm": 9.880915641784668, + "learning_rate": 1e-06, + "loss": 0.2061, + "step": 198 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.21502617746591568, + "epoch": 0.3189102564102564, + "grad_norm": 159.4617156982422, + "learning_rate": 1e-06, + "loss": 0.2279, + "step": 199 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2149796113371849, + "epoch": 0.32051282051282054, + "grad_norm": 0.021712809801101685, + "learning_rate": 1e-06, + "loss": 0.2856, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.30859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5767.0, + "completions/mean_length": 5912.080078125, + "completions/mean_terminated_length": 1238.17236328125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.19441968947649002, + "epoch": 0.32211538461538464, + "frac_reward_zero_std": 0.0, + "grad_norm": 380.85797119140625, + "learning_rate": 1e-06, + "loss": 0.2383, + "num_tokens": 219297724.0, + "reward": 0.19366931915283203, + "reward_std": 0.19040024280548096, + "rewards/progression_diversity/mean": -0.148204505443573, + "rewards/progression_diversity/std": 0.2301713079214096, + "rewards/symbolic_reward_accuracy/mean": 0.099609375, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.528759777545929, + "rewards/symbolic_reward_partial_score/std": 0.35425814986228943, + "rewards/tag_count_reward/mean": -0.232421875, + "rewards/tag_count_reward/std": 0.42278963327407837, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.979872465133667, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 490.0, + "sampling/sampling_logp_difference/mean": 17.00942611694336, + "step": 201 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6328125, + "entropy": 0.15824927389621735, + "epoch": 0.32371794871794873, + "grad_norm": 166.4498291015625, + "learning_rate": 1e-06, + "loss": 0.3229, + "step": 202 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.640625, + "entropy": 0.20293635874986649, + "epoch": 0.32532051282051283, + "grad_norm": 0.022664187476038933, + "learning_rate": 1e-06, + "loss": 0.2402, + "step": 203 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.671875, + "entropy": 0.1808452159166336, + "epoch": 0.3269230769230769, + "grad_norm": 0.013569237664341927, + "learning_rate": 1e-06, + "loss": 0.3128, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.26171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4302.0, + "completions/mean_length": 5205.919921875, + "completions/mean_terminated_length": 1243.320068359375, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.2147229164838791, + "epoch": 0.328525641025641, + "frac_reward_zero_std": 0.0, + "grad_norm": 922.9512939453125, + "learning_rate": 1e-06, + "loss": 0.2163, + "num_tokens": 222885587.0, + "reward": 0.2559148669242859, + "reward_std": 0.22893401980400085, + "rewards/progression_diversity/mean": -0.12775209546089172, + "rewards/progression_diversity/std": 0.2234182357788086, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.5825684070587158, + "rewards/symbolic_reward_partial_score/std": 0.36761772632598877, + "rewards/tag_count_reward/mean": -0.23046875, + "rewards/tag_count_reward/std": 0.42154473066329956, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9854786396026611, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 492.0, + "sampling/sampling_logp_difference/mean": 16.239261627197266, + "step": 205 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.2067190259695053, + "epoch": 0.3301282051282051, + "grad_norm": 220.25572204589844, + "learning_rate": 1e-06, + "loss": 0.2657, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.22213280200958252, + "epoch": 0.3317307692307692, + "grad_norm": 127.26791381835938, + "learning_rate": 1e-06, + "loss": 0.219, + "step": 207 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.20553400367498398, + "epoch": 0.3333333333333333, + "grad_norm": 14.982529640197754, + "learning_rate": 1e-06, + "loss": 0.2511, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.283203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4364.0, + "completions/mean_length": 5600.46484375, + "completions/mean_terminated_length": 1339.9400634765625, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.18529720604419708, + "epoch": 0.3349358974358974, + "frac_reward_zero_std": 0.0, + "grad_norm": 476.10418701171875, + "learning_rate": 1e-06, + "loss": 0.325, + "num_tokens": 226612593.0, + "reward": 0.20574292540550232, + "reward_std": 0.2026432454586029, + "rewards/progression_diversity/mean": -0.12785804271697998, + "rewards/progression_diversity/std": 0.21478815376758575, + "rewards/symbolic_reward_accuracy/mean": 0.11328125, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.5507487058639526, + "rewards/symbolic_reward_partial_score/std": 0.3405325710773468, + "rewards/tag_count_reward/mean": -0.26171875, + "rewards/tag_count_reward/std": 0.44000017642974854, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983894407749176, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 492.0, + "sampling/sampling_logp_difference/mean": 16.456192016601562, + "step": 209 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.19580994546413422, + "epoch": 0.33653846153846156, + "grad_norm": 216.3179931640625, + "learning_rate": 1e-06, + "loss": 0.2533, + "step": 210 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.19553975760936737, + "epoch": 0.33814102564102566, + "grad_norm": 1221.7452392578125, + "learning_rate": 1e-06, + "loss": 0.3216, + "step": 211 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.20991089940071106, + "epoch": 0.33974358974358976, + "grad_norm": 300.2898254394531, + "learning_rate": 1e-06, + "loss": 0.2328, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5346.0, + "completions/mean_length": 5424.576171875, + "completions/mean_terminated_length": 1460.5291748046875, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "entropy": 0.1831618994474411, + "epoch": 0.34134615384615385, + "frac_reward_zero_std": 0.0, + "grad_norm": 961.52490234375, + "learning_rate": 1e-06, + "loss": 0.3222, + "num_tokens": 230279560.0, + "reward": 0.22739171981811523, + "reward_std": 0.2230047881603241, + "rewards/progression_diversity/mean": -0.12215697765350342, + "rewards/progression_diversity/std": 0.21225501596927643, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.5380859375, + "rewards/symbolic_reward_partial_score/std": 0.3611067831516266, + "rewards/tag_count_reward/mean": -0.2421875, + "rewards/tag_count_reward/std": 0.42882615327835083, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.986311137676239, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 494.0, + "sampling/sampling_logp_difference/mean": 16.12148666381836, + "step": 213 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.22351820021867752, + "epoch": 0.34294871794871795, + "grad_norm": 112.74642181396484, + "learning_rate": 1e-06, + "loss": 0.2065, + "step": 214 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.20397323369979858, + "epoch": 0.34455128205128205, + "grad_norm": 106.15531921386719, + "learning_rate": 1e-06, + "loss": 0.2369, + "step": 215 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6328125, + "entropy": 0.17963356524705887, + "epoch": 0.34615384615384615, + "grad_norm": 2.7653439044952393, + "learning_rate": 1e-06, + "loss": 0.2618, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.251953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4075.0, + "completions/mean_length": 5181.009765625, + "completions/mean_terminated_length": 1407.6788330078125, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.21698909252882004, + "epoch": 0.34775641025641024, + "frac_reward_zero_std": 0.0, + "grad_norm": 2655.067138671875, + "learning_rate": 1e-06, + "loss": 0.1933, + "num_tokens": 233824525.0, + "reward": 0.2999926805496216, + "reward_std": 0.2547125220298767, + "rewards/progression_diversity/mean": -0.12036378681659698, + "rewards/progression_diversity/std": 0.2145136296749115, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.578857421875, + "rewards/symbolic_reward_partial_score/std": 0.3651575446128845, + "rewards/tag_count_reward/mean": -0.212890625, + "rewards/tag_count_reward/std": 0.409751296043396, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898908734321594, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 496.0, + "sampling/sampling_logp_difference/mean": 15.950399398803711, + "step": 217 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.19810165464878082, + "epoch": 0.34935897435897434, + "grad_norm": 90.42063903808594, + "learning_rate": 1e-06, + "loss": 0.2934, + "step": 218 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.21933219581842422, + "epoch": 0.35096153846153844, + "grad_norm": 158.33421325683594, + "learning_rate": 1e-06, + "loss": 0.1813, + "step": 219 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.22585007548332214, + "epoch": 0.3525641025641026, + "grad_norm": 114.27921295166016, + "learning_rate": 1e-06, + "loss": 0.2229, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.259765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4320.0, + "completions/mean_length": 5331.865234375, + "completions/mean_terminated_length": 1453.41162109375, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.20159848034381866, + "epoch": 0.3541666666666667, + "frac_reward_zero_std": 0.0625, + "grad_norm": 953.3041381835938, + "learning_rate": 1e-06, + "loss": 0.2609, + "num_tokens": 237447016.0, + "reward": 0.2347480058670044, + "reward_std": 0.1780080795288086, + "rewards/progression_diversity/mean": -0.12334457039833069, + "rewards/progression_diversity/std": 0.21702897548675537, + "rewards/symbolic_reward_accuracy/mean": 0.146484375, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.5691568851470947, + "rewards/symbolic_reward_partial_score/std": 0.3390918970108032, + "rewards/tag_count_reward/mean": -0.2265625, + "rewards/tag_count_reward/std": 0.4190165400505066, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891093969345093, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 496.0, + "sampling/sampling_logp_difference/mean": 16.061904907226562, + "step": 221 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.20780060440301895, + "epoch": 0.3557692307692308, + "grad_norm": 171.24063110351562, + "learning_rate": 1e-06, + "loss": 0.2021, + "step": 222 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.19532399624586105, + "epoch": 0.3573717948717949, + "grad_norm": 127.67808532714844, + "learning_rate": 1e-06, + "loss": 0.2701, + "step": 223 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2121635228395462, + "epoch": 0.358974358974359, + "grad_norm": 36.59706497192383, + "learning_rate": 1e-06, + "loss": 0.19, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3714.0, + "completions/mean_length": 4855.25390625, + "completions/mean_terminated_length": 1326.0458984375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.17904973775148392, + "epoch": 0.3605769230769231, + "frac_reward_zero_std": 0.0, + "grad_norm": 378.9632568359375, + "learning_rate": 1e-06, + "loss": 0.3551, + "num_tokens": 240758634.0, + "reward": 0.26461654901504517, + "reward_std": 0.2048269361257553, + "rewards/progression_diversity/mean": -0.11061383038759232, + "rewards/progression_diversity/std": 0.20703071355819702, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.5862630605697632, + "rewards/symbolic_reward_partial_score/std": 0.34296321868896484, + "rewards/tag_count_reward/mean": -0.203125, + "rewards/tag_count_reward/std": 0.4027182459831238, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891663789749146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 496.0, + "sampling/sampling_logp_difference/mean": 16.460721969604492, + "step": 225 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.20133854448795319, + "epoch": 0.36217948717948717, + "grad_norm": 1110.4901123046875, + "learning_rate": 1e-06, + "loss": 0.2465, + "step": 226 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.22215987741947174, + "epoch": 0.36378205128205127, + "grad_norm": 26.63279151916504, + "learning_rate": 1e-06, + "loss": 0.2531, + "step": 227 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.23379182070493698, + "epoch": 0.36538461538461536, + "grad_norm": 0.5691369771957397, + "learning_rate": 1e-06, + "loss": 0.1706, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3607.0, + "completions/mean_length": 4385.185546875, + "completions/mean_terminated_length": 1326.664306640625, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.23052329570055008, + "epoch": 0.36698717948717946, + "frac_reward_zero_std": 0.0, + "grad_norm": 360.1962585449219, + "learning_rate": 1e-06, + "loss": 0.2534, + "num_tokens": 243786505.0, + "reward": 0.3250322937965393, + "reward_std": 0.22688430547714233, + "rewards/progression_diversity/mean": -0.09101086109876633, + "rewards/progression_diversity/std": 0.18869265913963318, + "rewards/symbolic_reward_accuracy/mean": 0.25390625, + "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, + "rewards/symbolic_reward_partial_score/mean": 0.6346517205238342, + "rewards/symbolic_reward_partial_score/std": 0.33860599994659424, + "rewards/tag_count_reward/mean": -0.16796875, + "rewards/tag_count_reward/std": 0.374204158782959, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9971761703491211, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 498.0, + "sampling/sampling_logp_difference/mean": 14.882648468017578, + "step": 229 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2123500406742096, + "epoch": 0.3685897435897436, + "grad_norm": 343.66748046875, + "learning_rate": 1e-06, + "loss": 0.2335, + "step": 230 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.24668648838996887, + "epoch": 0.3701923076923077, + "grad_norm": 318.5328674316406, + "learning_rate": 1e-06, + "loss": 0.1648, + "step": 231 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.266249381005764, + "epoch": 0.3717948717948718, + "grad_norm": 0.021640097722411156, + "learning_rate": 1e-06, + "loss": 0.163, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.232421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4106.0, + "completions/mean_length": 4834.337890625, + "completions/mean_terminated_length": 1337.1119384765625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.22328076511621475, + "epoch": 0.3733974358974359, + "frac_reward_zero_std": 0.0, + "grad_norm": 1194.5416259765625, + "learning_rate": 1e-06, + "loss": 0.2147, + "num_tokens": 247135110.0, + "reward": 0.28227871656417847, + "reward_std": 0.21940109133720398, + "rewards/progression_diversity/mean": -0.10660064220428467, + "rewards/progression_diversity/std": 0.20069687068462372, + "rewards/symbolic_reward_accuracy/mean": 0.201171875, + "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, + "rewards/symbolic_reward_partial_score/mean": 0.6026855707168579, + "rewards/symbolic_reward_partial_score/std": 0.3378095328807831, + "rewards/tag_count_reward/mean": -0.181640625, + "rewards/tag_count_reward/std": 0.38592514395713806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9878015518188477, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 500.0, + "sampling/sampling_logp_difference/mean": 16.46212387084961, + "step": 233 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2334306240081787, + "epoch": 0.375, + "grad_norm": 738.572509765625, + "learning_rate": 1e-06, + "loss": 0.2937, + "step": 234 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.21482889354228973, + "epoch": 0.3766025641025641, + "grad_norm": 477.08978271484375, + "learning_rate": 1e-06, + "loss": 0.274, + "step": 235 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.21712397038936615, + "epoch": 0.3782051282051282, + "grad_norm": 372.0608215332031, + "learning_rate": 1e-06, + "loss": 0.33, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3953.0, + "completions/mean_length": 5198.51953125, + "completions/mean_terminated_length": 1391.9423828125, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "entropy": 0.1952293962240219, + "epoch": 0.3798076923076923, + "frac_reward_zero_std": 0.0, + "grad_norm": 534.4794311523438, + "learning_rate": 1e-06, + "loss": 0.223, + "num_tokens": 250668384.0, + "reward": 0.20283983647823334, + "reward_std": 0.18369260430335999, + "rewards/progression_diversity/mean": -0.1090836450457573, + "rewards/progression_diversity/std": 0.19674594700336456, + "rewards/symbolic_reward_accuracy/mean": 0.09765625, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.553466796875, + "rewards/symbolic_reward_partial_score/std": 0.3304186761379242, + "rewards/tag_count_reward/mean": -0.20703125, + "rewards/tag_count_reward/std": 0.40557438135147095, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892090559005737, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 500.0, + "sampling/sampling_logp_difference/mean": 16.296913146972656, + "step": 237 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.2330704852938652, + "epoch": 0.3814102564102564, + "grad_norm": 31.066238403320312, + "learning_rate": 1e-06, + "loss": 0.2238, + "step": 238 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.21373382210731506, + "epoch": 0.38301282051282054, + "grad_norm": 120.66889953613281, + "learning_rate": 1e-06, + "loss": 0.2189, + "step": 239 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.19722618907690048, + "epoch": 0.38461538461538464, + "grad_norm": 12.67758560180664, + "learning_rate": 1e-06, + "loss": 0.2972, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.19921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4564.0, + "completions/mean_length": 4309.216796875, + "completions/mean_terminated_length": 1305.246337890625, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.23315541446208954, + "epoch": 0.38621794871794873, + "frac_reward_zero_std": 0.03125, + "grad_norm": 488.22998046875, + "learning_rate": 1e-06, + "loss": 0.1621, + "num_tokens": 253703711.0, + "reward": 0.26993727684020996, + "reward_std": 0.21065741777420044, + "rewards/progression_diversity/mean": -0.09562888741493225, + "rewards/progression_diversity/std": 0.19961923360824585, + "rewards/symbolic_reward_accuracy/mean": 0.171875, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.6106607913970947, + "rewards/symbolic_reward_partial_score/std": 0.32578223943710327, + "rewards/tag_count_reward/mean": -0.154296875, + "rewards/tag_count_reward/std": 0.36158639192581177, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.994120717048645, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 504.0, + "sampling/sampling_logp_difference/mean": 15.699442863464355, + "step": 241 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.22725315392017365, + "epoch": 0.38782051282051283, + "grad_norm": 728.95068359375, + "learning_rate": 1e-06, + "loss": 0.2635, + "step": 242 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.24956174194812775, + "epoch": 0.3894230769230769, + "grad_norm": 216.6647491455078, + "learning_rate": 1e-06, + "loss": 0.162, + "step": 243 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.22464978694915771, + "epoch": 0.391025641025641, + "grad_norm": 351.8695373535156, + "learning_rate": 1e-06, + "loss": 0.241, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.236328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4847.0, + "completions/mean_length": 4997.755859375, + "completions/mean_terminated_length": 1474.135498046875, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.20301750302314758, + "epoch": 0.3926282051282051, + "frac_reward_zero_std": 0.0, + "grad_norm": 481.3808898925781, + "learning_rate": 1e-06, + "loss": 0.2076, + "num_tokens": 257209026.0, + "reward": 0.2624889016151428, + "reward_std": 0.19924888014793396, + "rewards/progression_diversity/mean": -0.10560305416584015, + "rewards/progression_diversity/std": 0.1964842677116394, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.5640299320220947, + "rewards/symbolic_reward_partial_score/std": 0.3424655795097351, + "rewards/tag_count_reward/mean": -0.181640625, + "rewards/tag_count_reward/std": 0.38592514395713806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906989336013794, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 504.0, + "sampling/sampling_logp_difference/mean": 16.07154083251953, + "step": 245 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2440878227353096, + "epoch": 0.3942307692307692, + "grad_norm": 111.19556427001953, + "learning_rate": 1e-06, + "loss": 0.1726, + "step": 246 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.22275415807962418, + "epoch": 0.3958333333333333, + "grad_norm": 237.71017456054688, + "learning_rate": 1e-06, + "loss": 0.2368, + "step": 247 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.3984375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.2082279548048973, + "epoch": 0.3974358974358974, + "grad_norm": 15.373961448669434, + "learning_rate": 1e-06, + "loss": 0.3212, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4378.0, + "completions/mean_length": 5073.48828125, + "completions/mean_terminated_length": 1458.78857421875, + "completions/min_length": 371.0, + "completions/min_terminated_length": 371.0, + "entropy": 0.2016061171889305, + "epoch": 0.39903846153846156, + "frac_reward_zero_std": 0.0, + "grad_norm": 1202.8836669921875, + "learning_rate": 1e-06, + "loss": 0.2794, + "num_tokens": 260669868.0, + "reward": 0.18259669840335846, + "reward_std": 0.16573688387870789, + "rewards/progression_diversity/mean": -0.1065426617860794, + "rewards/progression_diversity/std": 0.19588853418827057, + "rewards/symbolic_reward_accuracy/mean": 0.068359375, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + "rewards/symbolic_reward_partial_score/mean": 0.5438476204872131, + "rewards/symbolic_reward_partial_score/std": 0.32987871766090393, + "rewards/tag_count_reward/mean": -0.205078125, + "rewards/tag_count_reward/std": 0.4041535556316376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889222383499146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 504.0, + "sampling/sampling_logp_difference/mean": 16.378686904907227, + "step": 249 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2196366786956787, + "epoch": 0.40064102564102566, + "grad_norm": 56.70030975341797, + "learning_rate": 1e-06, + "loss": 0.2487, + "step": 250 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.2119002267718315, + "epoch": 0.40224358974358976, + "grad_norm": 3.978065013885498, + "learning_rate": 1e-06, + "loss": 0.2374, + "step": 251 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.21355099976062775, + "epoch": 0.40384615384615385, + "grad_norm": 0.02542886696755886, + "learning_rate": 1e-06, + "loss": 0.2578, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3575.0, + "completions/mean_length": 4913.615234375, + "completions/mean_terminated_length": 1402.27294921875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.22765664756298065, + "epoch": 0.40544871794871795, + "frac_reward_zero_std": 0.0, + "grad_norm": 457.21380615234375, + "learning_rate": 1e-06, + "loss": 0.1611, + "num_tokens": 264010663.0, + "reward": 0.2566929757595062, + "reward_std": 0.22699853777885437, + "rewards/progression_diversity/mean": -0.10218832641839981, + "rewards/progression_diversity/std": 0.19470685720443726, + "rewards/symbolic_reward_accuracy/mean": 0.177734375, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.5693359375, + "rewards/symbolic_reward_partial_score/std": 0.34047579765319824, + "rewards/tag_count_reward/mean": -0.197265625, + "rewards/tag_count_reward/std": 0.3983237147331238, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911985397338867, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 16.304428100585938, + "step": 253 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.21794138103723526, + "epoch": 0.40705128205128205, + "grad_norm": 297.2830505371094, + "learning_rate": 1e-06, + "loss": 0.3116, + "step": 254 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.3515625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2055223435163498, + "epoch": 0.40865384615384615, + "grad_norm": 0.2487526535987854, + "learning_rate": 1e-06, + "loss": 0.2665, + "step": 255 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.21468888968229294, + "epoch": 0.41025641025641024, + "grad_norm": 0.02222493290901184, + "learning_rate": 1e-06, + "loss": 0.2176, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.244140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3774.0, + "completions/mean_length": 5068.212890625, + "completions/mean_terminated_length": 1413.242919921875, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.21585044264793396, + "epoch": 0.41185897435897434, + "frac_reward_zero_std": 0.0, + "grad_norm": 821.6210327148438, + "learning_rate": 1e-06, + "loss": 0.2695, + "num_tokens": 267516868.0, + "reward": 0.2098761796951294, + "reward_std": 0.16895067691802979, + "rewards/progression_diversity/mean": -0.11101497709751129, + "rewards/progression_diversity/std": 0.20189963281154633, + "rewards/symbolic_reward_accuracy/mean": 0.11328125, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.5431314706802368, + "rewards/symbolic_reward_partial_score/std": 0.334158331155777, + "rewards/tag_count_reward/mean": -0.19921875, + "rewards/tag_count_reward/std": 0.39980348944664, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911006689071655, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 17.146984100341797, + "step": 257 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.22612430155277252, + "epoch": 0.41346153846153844, + "grad_norm": 543.9043579101562, + "learning_rate": 1e-06, + "loss": 0.3986, + "step": 258 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.244947150349617, + "epoch": 0.4150641025641026, + "grad_norm": 313.92132568359375, + "learning_rate": 1e-06, + "loss": 0.1894, + "step": 259 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6484375, + "entropy": 0.2207338884472847, + "epoch": 0.4166666666666667, + "grad_norm": 0.4175088405609131, + "learning_rate": 1e-06, + "loss": 0.2577, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5952.0, + "completions/mean_length": 5339.615234375, + "completions/mean_terminated_length": 1503.144775390625, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.23514293134212494, + "epoch": 0.4182692307692308, + "frac_reward_zero_std": 0.03125, + "grad_norm": 860.50244140625, + "learning_rate": 1e-06, + "loss": 0.2181, + "num_tokens": 271111039.0, + "reward": 0.2324339747428894, + "reward_std": 0.19143269956111908, + "rewards/progression_diversity/mean": -0.11207312345504761, + "rewards/progression_diversity/std": 0.19719642400741577, + "rewards/symbolic_reward_accuracy/mean": 0.16015625, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.524609386920929, + "rewards/symbolic_reward_partial_score/std": 0.3499920070171356, + "rewards/tag_count_reward/mean": -0.19921875, + "rewards/tag_count_reward/std": 0.39980348944664, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9931377172470093, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 16.724380493164062, + "step": 261 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.19652889668941498, + "epoch": 0.4198717948717949, + "grad_norm": 72.61575317382812, + "learning_rate": 1e-06, + "loss": 0.2661, + "step": 262 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2538982629776001, + "epoch": 0.421474358974359, + "grad_norm": 1.3014869689941406, + "learning_rate": 1e-06, + "loss": 0.1651, + "step": 263 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.3515625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.21324076503515244, + "epoch": 0.4230769230769231, + "grad_norm": 7.824087142944336, + "learning_rate": 1e-06, + "loss": 0.3078, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.23046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4048.0, + "completions/mean_length": 4864.2734375, + "completions/mean_terminated_length": 1414.2030029296875, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.24157585948705673, + "epoch": 0.42467948717948717, + "frac_reward_zero_std": 0.0, + "grad_norm": 681.8773193359375, + "learning_rate": 1e-06, + "loss": 0.2296, + "num_tokens": 274522011.0, + "reward": 0.23941701650619507, + "reward_std": 0.200226292014122, + "rewards/progression_diversity/mean": -0.0978497862815857, + "rewards/progression_diversity/std": 0.1862473040819168, + "rewards/symbolic_reward_accuracy/mean": 0.1484375, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.5630371570587158, + "rewards/symbolic_reward_partial_score/std": 0.34202542901039124, + "rewards/tag_count_reward/mean": -0.17578125, + "rewards/tag_count_reward/std": 0.3810062110424042, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9933131337165833, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 16.649776458740234, + "step": 265 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.25425516068935394, + "epoch": 0.42628205128205127, + "grad_norm": 0.02002408355474472, + "learning_rate": 1e-06, + "loss": 0.2305, + "step": 266 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6484375, + "entropy": 0.22516103833913803, + "epoch": 0.42788461538461536, + "grad_norm": 0.027018295601010323, + "learning_rate": 1e-06, + "loss": 0.2392, + "step": 267 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.237248495221138, + "epoch": 0.42948717948717946, + "grad_norm": 2.79491925239563, + "learning_rate": 1e-06, + "loss": 0.2575, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4029.0, + "completions/mean_length": 4960.13671875, + "completions/mean_terminated_length": 1463.03564453125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.24601341784000397, + "epoch": 0.4310897435897436, + "frac_reward_zero_std": 0.0, + "grad_norm": 1408.3753662109375, + "learning_rate": 1e-06, + "loss": 0.2201, + "num_tokens": 278050673.0, + "reward": 0.20770949125289917, + "reward_std": 0.19745418429374695, + "rewards/progression_diversity/mean": -0.10600487887859344, + "rewards/progression_diversity/std": 0.1964379996061325, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.5253255367279053, + "rewards/symbolic_reward_partial_score/std": 0.3296595513820648, + "rewards/tag_count_reward/mean": -0.19140625, + "rewards/tag_count_reward/std": 0.3937928080558777, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925892949104309, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 16.926132202148438, + "step": 269 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.24742096662521362, + "epoch": 0.4326923076923077, + "grad_norm": 0.18998423218727112, + "learning_rate": 1e-06, + "loss": 0.1687, + "step": 270 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.23993898928165436, + "epoch": 0.4342948717948718, + "grad_norm": 0.016234349459409714, + "learning_rate": 1e-06, + "loss": 0.3282, + "step": 271 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.22421810775995255, + "epoch": 0.4358974358974359, + "grad_norm": 0.023402217775583267, + "learning_rate": 1e-06, + "loss": 0.2581, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5016.0, + "completions/mean_length": 4715.15234375, + "completions/mean_terminated_length": 1595.7574462890625, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.2802828401327133, + "epoch": 0.4375, + "frac_reward_zero_std": 0.0, + "grad_norm": 389.8497619628906, + "learning_rate": 1e-06, + "loss": 0.1786, + "num_tokens": 281394383.0, + "reward": 0.2193732112646103, + "reward_std": 0.20153236389160156, + "rewards/progression_diversity/mean": -0.09344275295734406, + "rewards/progression_diversity/std": 0.18624533712863922, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.5436035394668579, + "rewards/symbolic_reward_partial_score/std": 0.3351518511772156, + "rewards/tag_count_reward/mean": -0.177734375, + "rewards/tag_count_reward/std": 0.3826628625392914, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0049525499343872, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 14.498746871948242, + "step": 273 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.25365348160266876, + "epoch": 0.4391025641025641, + "grad_norm": 250.75274658203125, + "learning_rate": 1e-06, + "loss": 0.2038, + "step": 274 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.2545860484242439, + "epoch": 0.4407051282051282, + "grad_norm": 0.01937059499323368, + "learning_rate": 1e-06, + "loss": 0.2308, + "step": 275 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.23184729367494583, + "epoch": 0.4423076923076923, + "grad_norm": 0.2053852677345276, + "learning_rate": 1e-06, + "loss": 0.2823, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.251953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4437.0, + "completions/mean_length": 5307.53125, + "completions/mean_terminated_length": 1576.814697265625, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.22293350100517273, + "epoch": 0.4439102564102564, + "frac_reward_zero_std": 0.0, + "grad_norm": 594.2986450195312, + "learning_rate": 1e-06, + "loss": 0.3091, + "num_tokens": 285031903.0, + "reward": 0.19161370396614075, + "reward_std": 0.18198563158512115, + "rewards/progression_diversity/mean": -0.11011378467082977, + "rewards/progression_diversity/std": 0.1955021768808365, + "rewards/symbolic_reward_accuracy/mean": 0.091796875, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.5317057371139526, + "rewards/symbolic_reward_partial_score/std": 0.3342364728450775, + "rewards/tag_count_reward/mean": -0.21875, + "rewards/tag_count_reward/std": 0.41380295157432556, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914793372154236, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 17.115394592285156, + "step": 277 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2029179409146309, + "epoch": 0.44551282051282054, + "grad_norm": 56.31192398071289, + "learning_rate": 1e-06, + "loss": 0.3572, + "step": 278 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3828125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6328125, + "entropy": 0.25365860760211945, + "epoch": 0.44711538461538464, + "grad_norm": 0.015985539183020592, + "learning_rate": 1e-06, + "loss": 0.1795, + "step": 279 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.25643934309482574, + "epoch": 0.44871794871794873, + "grad_norm": 8.044807434082031, + "learning_rate": 1e-06, + "loss": 0.2118, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.236328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5886.0, + "completions/mean_length": 5134.87890625, + "completions/mean_terminated_length": 1653.693115234375, + "completions/min_length": 457.0, + "completions/min_terminated_length": 457.0, + "entropy": 0.27302658557891846, + "epoch": 0.45032051282051283, + "frac_reward_zero_std": 0.0, + "grad_norm": 5191.970703125, + "learning_rate": 1e-06, + "loss": 0.1912, + "num_tokens": 288559489.0, + "reward": 0.20244288444519043, + "reward_std": 0.1676143854856491, + "rewards/progression_diversity/mean": -0.10288003087043762, + "rewards/progression_diversity/std": 0.19318129122257233, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.5037597417831421, + "rewards/symbolic_reward_partial_score/std": 0.34105032682418823, + "rewards/tag_count_reward/mean": -0.21484375, + "rewards/tag_count_reward/std": 0.4111155867576599, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008915662765503, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 15.52652359008789, + "step": 281 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.2505979463458061, + "epoch": 0.4519230769230769, + "grad_norm": 1842.8763427734375, + "learning_rate": 1e-06, + "loss": 0.2985, + "step": 282 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3671875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.23081808537244797, + "epoch": 0.453525641025641, + "grad_norm": 0.18885204195976257, + "learning_rate": 1e-06, + "loss": 0.3087, + "step": 283 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.2553938925266266, + "epoch": 0.4551282051282051, + "grad_norm": 0.015816714614629745, + "learning_rate": 1e-06, + "loss": 0.224, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.22265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4693.0, + "completions/mean_length": 4956.9609375, + "completions/mean_terminated_length": 1683.889404296875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.2509460896253586, + "epoch": 0.4567307692307692, + "frac_reward_zero_std": 0.0, + "grad_norm": 501.21697998046875, + "learning_rate": 1e-06, + "loss": 0.2214, + "num_tokens": 291920845.0, + "reward": 0.19516658782958984, + "reward_std": 0.18093836307525635, + "rewards/progression_diversity/mean": -0.09760037809610367, + "rewards/progression_diversity/std": 0.18880048394203186, + "rewards/symbolic_reward_accuracy/mean": 0.09375, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.5301106572151184, + "rewards/symbolic_reward_partial_score/std": 0.3172612190246582, + "rewards/tag_count_reward/mean": -0.19140625, + "rewards/tag_count_reward/std": 0.3937928080558777, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9981337189674377, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 15.91734790802002, + "step": 285 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.2802078127861023, + "epoch": 0.4583333333333333, + "grad_norm": 113.05368041992188, + "learning_rate": 1e-06, + "loss": 0.2016, + "step": 286 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.65625, + "entropy": 0.24139665812253952, + "epoch": 0.4599358974358974, + "grad_norm": 0.025173721835017204, + "learning_rate": 1e-06, + "loss": 0.292, + "step": 287 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.27211469411849976, + "epoch": 0.46153846153846156, + "grad_norm": 3.040055513381958, + "learning_rate": 1e-06, + "loss": 0.2398, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.220703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5427.0, + "completions/mean_length": 4934.23046875, + "completions/mean_terminated_length": 1691.5638427734375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.2553727924823761, + "epoch": 0.46314102564102566, + "frac_reward_zero_std": 0.0, + "grad_norm": 1051.9158935546875, + "learning_rate": 1e-06, + "loss": 0.2454, + "num_tokens": 295288899.0, + "reward": 0.2229723334312439, + "reward_std": 0.21836064755916595, + "rewards/progression_diversity/mean": -0.09778685122728348, + "rewards/progression_diversity/std": 0.1902952492237091, + "rewards/symbolic_reward_accuracy/mean": 0.13671875, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.540771484375, + "rewards/symbolic_reward_partial_score/std": 0.33182287216186523, + "rewards/tag_count_reward/mean": -0.203125, + "rewards/tag_count_reward/std": 0.4027182459831238, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.004034161567688, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 14.605140686035156, + "step": 289 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.24111786484718323, + "epoch": 0.46474358974358976, + "grad_norm": 3.2952563762664795, + "learning_rate": 1e-06, + "loss": 0.2509, + "step": 290 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.2719377726316452, + "epoch": 0.46634615384615385, + "grad_norm": 0.019442997872829437, + "learning_rate": 1e-06, + "loss": 0.1819, + "step": 291 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.65625, + "entropy": 0.2737286537885666, + "epoch": 0.46794871794871795, + "grad_norm": 0.02424200251698494, + "learning_rate": 1e-06, + "loss": 0.2014, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.16015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4168.0, + "completions/mean_length": 4012.345703125, + "completions/mean_terminated_length": 1653.0999755859375, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.2804551422595978, + "epoch": 0.46955128205128205, + "frac_reward_zero_std": 0.0, + "grad_norm": 807.3659057617188, + "learning_rate": 1e-06, + "loss": 0.1918, + "num_tokens": 298231508.0, + "reward": 0.1925782412290573, + "reward_std": 0.16427671909332275, + "rewards/progression_diversity/mean": -0.07176616787910461, + "rewards/progression_diversity/std": 0.16853131353855133, + "rewards/symbolic_reward_accuracy/mean": 0.08203125, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.5264810919761658, + "rewards/symbolic_reward_partial_score/std": 0.3188028335571289, + "rewards/tag_count_reward/mean": -0.138671875, + "rewards/tag_count_reward/std": 0.34594178199768066, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0244982242584229, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 504.0, + "sampling/sampling_logp_difference/mean": 11.09280776977539, + "step": 293 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.30911885201931, + "epoch": 0.47115384615384615, + "grad_norm": 343.7190246582031, + "learning_rate": 1e-06, + "loss": 0.1829, + "step": 294 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.2846119552850723, + "epoch": 0.47275641025641024, + "grad_norm": 650.09619140625, + "learning_rate": 1e-06, + "loss": 0.2431, + "step": 295 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3162567764520645, + "epoch": 0.47435897435897434, + "grad_norm": 0.03714370355010033, + "learning_rate": 1e-06, + "loss": 0.1759, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.228515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4304.0, + "completions/mean_length": 5002.0859375, + "completions/mean_terminated_length": 1630.734130859375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.2860996425151825, + "epoch": 0.47596153846153844, + "frac_reward_zero_std": 0.0, + "grad_norm": 530.2379150390625, + "learning_rate": 1e-06, + "loss": 0.2122, + "num_tokens": 301642864.0, + "reward": 0.253789484500885, + "reward_std": 0.20528803765773773, + "rewards/progression_diversity/mean": -0.10005459934473038, + "rewards/progression_diversity/std": 0.1889607459306717, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.5315917730331421, + "rewards/symbolic_reward_partial_score/std": 0.363020122051239, + "rewards/tag_count_reward/mean": -0.18359375, + "rewards/tag_count_reward/std": 0.3875311613082886, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0061748027801514, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 14.482137680053711, + "step": 297 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.27232350409030914, + "epoch": 0.4775641025641026, + "grad_norm": 98.60669708251953, + "learning_rate": 1e-06, + "loss": 0.2171, + "step": 298 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.40625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.2478073537349701, + "epoch": 0.4791666666666667, + "grad_norm": 10.104867935180664, + "learning_rate": 1e-06, + "loss": 0.2962, + "step": 299 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.28513647615909576, + "epoch": 0.4807692307692308, + "grad_norm": 0.036038998514413834, + "learning_rate": 1e-06, + "loss": 0.2311, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.19140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4377.0, + "completions/mean_length": 4445.552734375, + "completions/mean_terminated_length": 1619.54345703125, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.3068581074476242, + "epoch": 0.4823717948717949, + "frac_reward_zero_std": 0.0, + "grad_norm": 330.840087890625, + "learning_rate": 1e-06, + "loss": 0.1263, + "num_tokens": 304794523.0, + "reward": 0.2764816880226135, + "reward_std": 0.20622991025447845, + "rewards/progression_diversity/mean": -0.08181288093328476, + "rewards/progression_diversity/std": 0.17579428851604462, + "rewards/symbolic_reward_accuracy/mean": 0.205078125, + "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, + "rewards/symbolic_reward_partial_score/mean": 0.5688639283180237, + "rewards/symbolic_reward_partial_score/std": 0.3486393988132477, + "rewards/tag_count_reward/mean": -0.1640625, + "rewards/tag_count_reward/std": 0.37069445848464966, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0138086080551147, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 508.0, + "sampling/sampling_logp_difference/mean": 13.007370948791504, + "step": 301 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2967989444732666, + "epoch": 0.483974358974359, + "grad_norm": 484.8408203125, + "learning_rate": 1e-06, + "loss": 0.2192, + "step": 302 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.26128391921520233, + "epoch": 0.4855769230769231, + "grad_norm": 281.1661376953125, + "learning_rate": 1e-06, + "loss": 0.2839, + "step": 303 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.2628230154514313, + "epoch": 0.48717948717948717, + "grad_norm": 0.027397677302360535, + "learning_rate": 1e-06, + "loss": 0.2977, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.19140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4609.0, + "completions/mean_length": 4459.234375, + "completions/mean_terminated_length": 1636.4637451171875, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.28316499292850494, + "epoch": 0.48878205128205127, + "frac_reward_zero_std": 0.03125, + "grad_norm": 847.23779296875, + "learning_rate": 1e-06, + "loss": 0.2316, + "num_tokens": 307925859.0, + "reward": 0.25070130825042725, + "reward_std": 0.16120730340480804, + "rewards/progression_diversity/mean": -0.08758464455604553, + "rewards/progression_diversity/std": 0.18611328303813934, + "rewards/symbolic_reward_accuracy/mean": 0.1640625, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.5677571296691895, + "rewards/symbolic_reward_partial_score/std": 0.333774209022522, + "rewards/tag_count_reward/mean": -0.171875, + "rewards/tag_count_reward/std": 0.3776407241821289, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0140712261199951, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 512.0, + "sampling/sampling_logp_difference/mean": 13.223786354064941, + "step": 305 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.2871243506669998, + "epoch": 0.49038461538461536, + "grad_norm": 393.9886474609375, + "learning_rate": 1e-06, + "loss": 0.2205, + "step": 306 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3013138175010681, + "epoch": 0.49198717948717946, + "grad_norm": 0.020085537806153297, + "learning_rate": 1e-06, + "loss": 0.1944, + "step": 307 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.2749166488647461, + "epoch": 0.4935897435897436, + "grad_norm": 0.01898149587213993, + "learning_rate": 1e-06, + "loss": 0.2895, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.16796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5109.0, + "completions/mean_length": 4079.05859375, + "completions/mean_terminated_length": 1594.9625244140625, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "entropy": 0.2986243665218353, + "epoch": 0.4951923076923077, + "frac_reward_zero_std": 0.0625, + "grad_norm": 579.7413330078125, + "learning_rate": 1e-06, + "loss": 0.2209, + "num_tokens": 310795841.0, + "reward": 0.24147483706474304, + "reward_std": 0.18043410778045654, + "rewards/progression_diversity/mean": -0.0688263550400734, + "rewards/progression_diversity/std": 0.15821245312690735, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.589111328125, + "rewards/symbolic_reward_partial_score/std": 0.3147651255130768, + "rewards/tag_count_reward/mean": -0.142578125, + "rewards/tag_count_reward/std": 0.3499840497970581, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0225772857666016, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 512.0, + "sampling/sampling_logp_difference/mean": 11.870687484741211, + "step": 309 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3223755210638046, + "epoch": 0.4967948717948718, + "grad_norm": 256.777099609375, + "learning_rate": 1e-06, + "loss": 0.1514, + "step": 310 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3142601400613785, + "epoch": 0.4983974358974359, + "grad_norm": 0.41229262948036194, + "learning_rate": 1e-06, + "loss": 0.1924, + "step": 311 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.29151175916194916, + "epoch": 0.5, + "grad_norm": 0.013734478503465652, + "learning_rate": 1e-06, + "loss": 0.2275, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.150390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4919.0, + "completions/mean_length": 3832.859375, + "completions/mean_terminated_length": 1611.1632080078125, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.29491689801216125, + "epoch": 0.5016025641025641, + "frac_reward_zero_std": 0.0625, + "grad_norm": 841.0869140625, + "learning_rate": 1e-06, + "loss": 0.2169, + "num_tokens": 313608521.0, + "reward": 0.32326555252075195, + "reward_std": 0.19986380636692047, + "rewards/progression_diversity/mean": -0.06456150114536285, + "rewards/progression_diversity/std": 0.15824860334396362, + "rewards/symbolic_reward_accuracy/mean": 0.251953125, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.6194173097610474, + "rewards/symbolic_reward_partial_score/std": 0.3335736393928528, + "rewards/tag_count_reward/mean": -0.130859375, + "rewards/tag_count_reward/std": 0.33757632970809937, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.027578353881836, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 512.0, + "sampling/sampling_logp_difference/mean": 10.916871070861816, + "step": 313 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.33330613374710083, + "epoch": 0.5032051282051282, + "grad_norm": 125.96926879882812, + "learning_rate": 1e-06, + "loss": 0.1417, + "step": 314 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2990605980157852, + "epoch": 0.5048076923076923, + "grad_norm": 0.028910215944051743, + "learning_rate": 1e-06, + "loss": 0.1997, + "step": 315 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.31929296255111694, + "epoch": 0.5064102564102564, + "grad_norm": 114.29574584960938, + "learning_rate": 1e-06, + "loss": 0.1296, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6009.0, + "completions/mean_length": 4275.66015625, + "completions/mean_terminated_length": 1762.6085205078125, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.32325243949890137, + "epoch": 0.5080128205128205, + "frac_reward_zero_std": 0.0, + "grad_norm": 370.2263488769531, + "learning_rate": 1e-06, + "loss": 0.1346, + "num_tokens": 316649083.0, + "reward": 0.24201995134353638, + "reward_std": 0.18078970909118652, + "rewards/progression_diversity/mean": -0.06900203227996826, + "rewards/progression_diversity/std": 0.1568773090839386, + "rewards/symbolic_reward_accuracy/mean": 0.142578125, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.5694499015808105, + "rewards/symbolic_reward_partial_score/std": 0.31546229124069214, + "rewards/tag_count_reward/mean": -0.13671875, + "rewards/tag_count_reward/std": 0.3438861668109894, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0173547267913818, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 512.0, + "sampling/sampling_logp_difference/mean": 12.444093704223633, + "step": 317 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.2793115973472595, + "epoch": 0.5096153846153846, + "grad_norm": 7.101436614990234, + "learning_rate": 1e-06, + "loss": 0.2415, + "step": 318 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.2891732454299927, + "epoch": 0.5112179487179487, + "grad_norm": 0.021407851949334145, + "learning_rate": 1e-06, + "loss": 0.2313, + "step": 319 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.28606168925762177, + "epoch": 0.5128205128205128, + "grad_norm": 0.01678905449807644, + "learning_rate": 1e-06, + "loss": 0.2457, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4942.0, + "completions/mean_length": 3954.216796875, + "completions/mean_terminated_length": 1720.2926025390625, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.2966272532939911, + "epoch": 0.5144230769230769, + "frac_reward_zero_std": 0.0, + "grad_norm": 758.809814453125, + "learning_rate": 1e-06, + "loss": 0.2448, + "num_tokens": 319542234.0, + "reward": 0.23621943593025208, + "reward_std": 0.17420727014541626, + "rewards/progression_diversity/mean": -0.062140051275491714, + "rewards/progression_diversity/std": 0.15392348170280457, + "rewards/symbolic_reward_accuracy/mean": 0.126953125, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.5752767324447632, + "rewards/symbolic_reward_partial_score/std": 0.3146626949310303, + "rewards/tag_count_reward/mean": -0.119140625, + "rewards/tag_count_reward/std": 0.32427072525024414, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0240211486816406, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 512.0, + "sampling/sampling_logp_difference/mean": 11.527631759643555, + "step": 321 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.33031274378299713, + "epoch": 0.5160256410256411, + "grad_norm": 0.021069565787911415, + "learning_rate": 1e-06, + "loss": 0.1706, + "step": 322 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.3213012218475342, + "epoch": 0.5176282051282052, + "grad_norm": 0.34873709082603455, + "learning_rate": 1e-06, + "loss": 0.1742, + "step": 323 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.30144381523132324, + "epoch": 0.5192307692307693, + "grad_norm": 0.02283627726137638, + "learning_rate": 1e-06, + "loss": 0.2115, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4747.0, + "completions/mean_length": 4465.17578125, + "completions/mean_terminated_length": 1854.3857421875, + "completions/min_length": 530.0, + "completions/min_terminated_length": 530.0, + "entropy": 0.32395896315574646, + "epoch": 0.5208333333333334, + "frac_reward_zero_std": 0.03125, + "grad_norm": 343.3765563964844, + "learning_rate": 1e-06, + "loss": 0.1178, + "num_tokens": 322758580.0, + "reward": 0.20879912376403809, + "reward_std": 0.1630467027425766, + "rewards/progression_diversity/mean": -0.07272521406412125, + "rewards/progression_diversity/std": 0.16325299441814423, + "rewards/symbolic_reward_accuracy/mean": 0.099609375, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.5441243648529053, + "rewards/symbolic_reward_partial_score/std": 0.3322213888168335, + "rewards/tag_count_reward/mean": -0.134765625, + "rewards/tag_count_reward/std": 0.3418070077896118, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0155704021453857, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 512.0, + "sampling/sampling_logp_difference/mean": 12.669414520263672, + "step": 325 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3014257103204727, + "epoch": 0.5224358974358975, + "grad_norm": 0.020518580451607704, + "learning_rate": 1e-06, + "loss": 0.2602, + "step": 326 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.29575052857398987, + "epoch": 0.5240384615384616, + "grad_norm": 530.0026245117188, + "learning_rate": 1e-06, + "loss": 0.2193, + "step": 327 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.3828125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.2522171512246132, + "epoch": 0.5256410256410257, + "grad_norm": 0.015645645558834076, + "learning_rate": 1e-06, + "loss": 0.3602, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5689.0, + "completions/mean_length": 4896.400390625, + "completions/mean_terminated_length": 1968.1888427734375, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.28659459948539734, + "epoch": 0.5272435897435898, + "frac_reward_zero_std": 0.0, + "grad_norm": 472.4561462402344, + "learning_rate": 1e-06, + "loss": 0.1946, + "num_tokens": 326107713.0, + "reward": 0.27258506417274475, + "reward_std": 0.22720691561698914, + "rewards/progression_diversity/mean": -0.0842689722776413, + "rewards/progression_diversity/std": 0.17504683136940002, + "rewards/symbolic_reward_accuracy/mean": 0.20703125, + "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, + "rewards/symbolic_reward_partial_score/mean": 0.554003894329071, + "rewards/symbolic_reward_partial_score/std": 0.3567145764827728, + "rewards/tag_count_reward/mean": -0.169921875, + "rewards/tag_count_reward/std": 0.3759314715862274, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.014817714691162, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 512.0, + "sampling/sampling_logp_difference/mean": 12.392074584960938, + "step": 329 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.2568201795220375, + "epoch": 0.5288461538461539, + "grad_norm": 0.02229255437850952, + "learning_rate": 1e-06, + "loss": 0.2685, + "step": 330 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.2770218998193741, + "epoch": 0.530448717948718, + "grad_norm": 0.020292697474360466, + "learning_rate": 1e-06, + "loss": 0.2106, + "step": 331 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.27906210720539093, + "epoch": 0.532051282051282, + "grad_norm": 0.01652703620493412, + "learning_rate": 1e-06, + "loss": 0.2365, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.240234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6400.0, + "completions/mean_length": 5536.8125, + "completions/mean_terminated_length": 2106.98193359375, + "completions/min_length": 382.0, + "completions/min_terminated_length": 382.0, + "entropy": 0.24760161340236664, + "epoch": 0.5336538461538461, + "frac_reward_zero_std": 0.0, + "grad_norm": 698.7391967773438, + "learning_rate": 1e-06, + "loss": 0.2491, + "num_tokens": 329775969.0, + "reward": 0.1934526264667511, + "reward_std": 0.1800420880317688, + "rewards/progression_diversity/mean": -0.09467939287424088, + "rewards/progression_diversity/std": 0.17836976051330566, + "rewards/symbolic_reward_accuracy/mean": 0.107421875, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.49760740995407104, + "rewards/symbolic_reward_partial_score/std": 0.3303731083869934, + "rewards/tag_count_reward/mean": -0.193359375, + "rewards/tag_count_reward/std": 0.39531853795051575, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0133090019226074, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 12.529105186462402, + "step": 333 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.27296438813209534, + "epoch": 0.5352564102564102, + "grad_norm": 238.9752960205078, + "learning_rate": 1e-06, + "loss": 0.2225, + "step": 334 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.248204804956913, + "epoch": 0.5368589743589743, + "grad_norm": 106.70677185058594, + "learning_rate": 1e-06, + "loss": 0.2809, + "step": 335 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.2727896124124527, + "epoch": 0.5384615384615384, + "grad_norm": 0.031822893768548965, + "learning_rate": 1e-06, + "loss": 0.2527, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5026.0, + "completions/mean_length": 5709.119140625, + "completions/mean_terminated_length": 2001.002685546875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.25325363129377365, + "epoch": 0.5400641025641025, + "frac_reward_zero_std": 0.0, + "grad_norm": 363.06060791015625, + "learning_rate": 1e-06, + "loss": 0.2604, + "num_tokens": 333527678.0, + "reward": 0.23530638217926025, + "reward_std": 0.22434313595294952, + "rewards/progression_diversity/mean": -0.1065697893500328, + "rewards/progression_diversity/std": 0.18764851987361908, + "rewards/symbolic_reward_accuracy/mean": 0.171875, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.5112141966819763, + "rewards/symbolic_reward_partial_score/std": 0.3657578229904175, + "rewards/tag_count_reward/mean": -0.201171875, + "rewards/tag_count_reward/std": 0.4012683033943176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.006093144416809, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 14.130290031433105, + "step": 337 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.24173004925251007, + "epoch": 0.5416666666666666, + "grad_norm": 319.6545715332031, + "learning_rate": 1e-06, + "loss": 0.2882, + "step": 338 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2622825503349304, + "epoch": 0.5432692307692307, + "grad_norm": 219.5463409423828, + "learning_rate": 1e-06, + "loss": 0.2093, + "step": 339 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.2560427188873291, + "epoch": 0.5448717948717948, + "grad_norm": 0.026842230930924416, + "learning_rate": 1e-06, + "loss": 0.2614, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.279296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5854.0, + "completions/mean_length": 6195.84765625, + "completions/mean_terminated_length": 2247.593505859375, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "entropy": 0.23094411194324493, + "epoch": 0.5464743589743589, + "frac_reward_zero_std": 0.0, + "grad_norm": 853.015625, + "learning_rate": 1e-06, + "loss": 0.3191, + "num_tokens": 337591040.0, + "reward": 0.1426115334033966, + "reward_std": 0.17017614841461182, + "rewards/progression_diversity/mean": -0.10945233702659607, + "rewards/progression_diversity/std": 0.18531769514083862, + "rewards/symbolic_reward_accuracy/mean": 0.064453125, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.4288899898529053, + "rewards/symbolic_reward_partial_score/std": 0.330017626285553, + "rewards/tag_count_reward/mean": -0.236328125, + "rewards/tag_count_reward/std": 0.42524150013923645, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0073082447052002, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 13.80842399597168, + "step": 341 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.2576480060815811, + "epoch": 0.5480769230769231, + "grad_norm": 14.71423053741455, + "learning_rate": 1e-06, + "loss": 0.2245, + "step": 342 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.3515625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.2539537325501442, + "epoch": 0.5496794871794872, + "grad_norm": 0.01885385625064373, + "learning_rate": 1e-06, + "loss": 0.2631, + "step": 343 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.2588704973459244, + "epoch": 0.5512820512820513, + "grad_norm": 0.022954288870096207, + "learning_rate": 1e-06, + "loss": 0.2359, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5730.0, + "completions/mean_length": 5692.001953125, + "completions/mean_terminated_length": 2128.002685546875, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "entropy": 0.28620630502700806, + "epoch": 0.5528846153846154, + "frac_reward_zero_std": 0.0, + "grad_norm": 664.2322998046875, + "learning_rate": 1e-06, + "loss": 0.2053, + "num_tokens": 341301089.0, + "reward": 0.21951040625572205, + "reward_std": 0.22114171087741852, + "rewards/progression_diversity/mean": -0.09241549670696259, + "rewards/progression_diversity/std": 0.17231649160385132, + "rewards/symbolic_reward_accuracy/mean": 0.1484375, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.5049642324447632, + "rewards/symbolic_reward_partial_score/std": 0.3561265468597412, + "rewards/tag_count_reward/mean": -0.201171875, + "rewards/tag_count_reward/std": 0.4012683033943176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0092263221740723, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 13.769186019897461, + "step": 345 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.2516386955976486, + "epoch": 0.5544871794871795, + "grad_norm": 547.4846801757812, + "learning_rate": 1e-06, + "loss": 0.3033, + "step": 346 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.2582557052373886, + "epoch": 0.5560897435897436, + "grad_norm": 0.3687719404697418, + "learning_rate": 1e-06, + "loss": 0.2881, + "step": 347 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.25732411444187164, + "epoch": 0.5576923076923077, + "grad_norm": 0.022600045427680016, + "learning_rate": 1e-06, + "loss": 0.1909, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.23046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5114.0, + "completions/mean_length": 5463.091796875, + "completions/mean_terminated_length": 2192.36279296875, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "entropy": 0.2584301382303238, + "epoch": 0.5592948717948718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1072.4036865234375, + "learning_rate": 1e-06, + "loss": 0.2856, + "num_tokens": 344947984.0, + "reward": 0.17411813139915466, + "reward_std": 0.17677854001522064, + "rewards/progression_diversity/mean": -0.08086328208446503, + "rewards/progression_diversity/std": 0.1584118902683258, + "rewards/symbolic_reward_accuracy/mean": 0.083984375, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.4769693911075592, + "rewards/symbolic_reward_partial_score/std": 0.3174276053905487, + "rewards/tag_count_reward/mean": -0.185546875, + "rewards/tag_count_reward/std": 0.38912075757980347, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.015385389328003, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 12.646538734436035, + "step": 349 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.28206463158130646, + "epoch": 0.5608974358974359, + "grad_norm": 23.78731918334961, + "learning_rate": 1e-06, + "loss": 0.2093, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2668316960334778, + "epoch": 0.5625, + "grad_norm": 0.019982578232884407, + "learning_rate": 1e-06, + "loss": 0.2188, + "step": 351 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.27186111360788345, + "epoch": 0.5641025641025641, + "grad_norm": 0.04332799091935158, + "learning_rate": 1e-06, + "loss": 0.2327, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.255859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5015.0, + "completions/mean_length": 5713.78515625, + "completions/mean_terminated_length": 2045.0235595703125, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 0.24826274812221527, + "epoch": 0.5657051282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 2501.171875, + "learning_rate": 1e-06, + "loss": 0.2487, + "num_tokens": 348796562.0, + "reward": 0.2261546105146408, + "reward_std": 0.22966080904006958, + "rewards/progression_diversity/mean": -0.09401117265224457, + "rewards/progression_diversity/std": 0.1731572151184082, + "rewards/symbolic_reward_accuracy/mean": 0.166015625, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.4926595091819763, + "rewards/symbolic_reward_partial_score/std": 0.3556332588195801, + "rewards/tag_count_reward/mean": -0.203125, + "rewards/tag_count_reward/std": 0.4027182459831238, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0071918964385986, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 14.334271430969238, + "step": 353 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.23615773022174835, + "epoch": 0.5673076923076923, + "grad_norm": 180.40103149414062, + "learning_rate": 1e-06, + "loss": 0.3324, + "step": 354 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.2617553621530533, + "epoch": 0.5689102564102564, + "grad_norm": 15.31356143951416, + "learning_rate": 1e-06, + "loss": 0.2533, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.28039678931236267, + "epoch": 0.5705128205128205, + "grad_norm": 0.03650696948170662, + "learning_rate": 1e-06, + "loss": 0.209, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.30859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6289.0, + "completions/mean_length": 6547.625, + "completions/mean_terminated_length": 2157.37841796875, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.2463904321193695, + "epoch": 0.5721153846153846, + "frac_reward_zero_std": 0.0, + "grad_norm": 776.6242065429688, + "learning_rate": 1e-06, + "loss": 0.2591, + "num_tokens": 353049346.0, + "reward": 0.15728139877319336, + "reward_std": 0.19684484601020813, + "rewards/progression_diversity/mean": -0.11365720629692078, + "rewards/progression_diversity/std": 0.1853574961423874, + "rewards/symbolic_reward_accuracy/mean": 0.083984375, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.4564453363418579, + "rewards/symbolic_reward_partial_score/std": 0.3446153700351715, + "rewards/tag_count_reward/mean": -0.2890625, + "rewards/tag_count_reward/std": 0.45377036929130554, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0021424293518066, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 15.470850944519043, + "step": 357 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.23972290009260178, + "epoch": 0.5737179487179487, + "grad_norm": 157.12100219726562, + "learning_rate": 1e-06, + "loss": 0.2861, + "step": 358 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.2611324340105057, + "epoch": 0.5753205128205128, + "grad_norm": 0.026206420734524727, + "learning_rate": 1e-06, + "loss": 0.2928, + "step": 359 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6328125, + "entropy": 0.2634492665529251, + "epoch": 0.5769230769230769, + "grad_norm": 6.0313310623168945, + "learning_rate": 1e-06, + "loss": 0.346, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5884.0, + "completions/mean_length": 6286.490234375, + "completions/mean_terminated_length": 2180.909423828125, + "completions/min_length": 398.0, + "completions/min_terminated_length": 398.0, + "entropy": 0.2577165514230728, + "epoch": 0.5785256410256411, + "frac_reward_zero_std": 0.0, + "grad_norm": 268.4336853027344, + "learning_rate": 1e-06, + "loss": 0.2247, + "num_tokens": 357130125.0, + "reward": 0.1658135950565338, + "reward_std": 0.19578394293785095, + "rewards/progression_diversity/mean": -0.116884745657444, + "rewards/progression_diversity/std": 0.19544658064842224, + "rewards/symbolic_reward_accuracy/mean": 0.09375, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.45634764432907104, + "rewards/symbolic_reward_partial_score/std": 0.3385258913040161, + "rewards/tag_count_reward/mean": -0.26171875, + "rewards/tag_count_reward/std": 0.44000017642974854, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001070261001587, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 15.70700454711914, + "step": 361 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.21536707878112793, + "epoch": 0.5801282051282052, + "grad_norm": 2007.2310791015625, + "learning_rate": 1e-06, + "loss": 0.3851, + "step": 362 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.59375, + "entropy": 0.23036614060401917, + "epoch": 0.5817307692307693, + "grad_norm": 188.44676208496094, + "learning_rate": 1e-06, + "loss": 0.3252, + "step": 363 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.24637820571660995, + "epoch": 0.5833333333333334, + "grad_norm": 0.025239424780011177, + "learning_rate": 1e-06, + "loss": 0.2805, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5414.0, + "completions/mean_length": 7036.28125, + "completions/mean_terminated_length": 2307.435302734375, + "completions/min_length": 542.0, + "completions/min_terminated_length": 542.0, + "entropy": 0.2505442202091217, + "epoch": 0.5849358974358975, + "frac_reward_zero_std": 0.0, + "grad_norm": 329.7117614746094, + "learning_rate": 1e-06, + "loss": 0.1805, + "num_tokens": 361515437.0, + "reward": 0.15802721679210663, + "reward_std": 0.16592083871364594, + "rewards/progression_diversity/mean": -0.14015081524848938, + "rewards/progression_diversity/std": 0.20965632796287537, + "rewards/symbolic_reward_accuracy/mean": 0.080078125, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.47153323888778687, + "rewards/symbolic_reward_partial_score/std": 0.3264097571372986, + "rewards/tag_count_reward/mean": -0.30078125, + "rewards/tag_count_reward/std": 0.45904624462127686, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9985228180885315, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 15.96454906463623, + "step": 365 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.21816373616456985, + "epoch": 0.5865384615384616, + "grad_norm": 1167.26611328125, + "learning_rate": 1e-06, + "loss": 0.2898, + "step": 366 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3828125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6015625, + "entropy": 0.2148098573088646, + "epoch": 0.5881410256410257, + "grad_norm": 8.457894325256348, + "learning_rate": 1e-06, + "loss": 0.3274, + "step": 367 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.65625, + "entropy": 0.2301519215106964, + "epoch": 0.5897435897435898, + "grad_norm": 40.98564147949219, + "learning_rate": 1e-06, + "loss": 0.2575, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.380859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5608.0, + "completions/mean_length": 7724.8984375, + "completions/mean_terminated_length": 2398.32177734375, + "completions/min_length": 571.0, + "completions/min_terminated_length": 571.0, + "entropy": 0.20847928524017334, + "epoch": 0.5913461538461539, + "frac_reward_zero_std": 0.0, + "grad_norm": 825.861083984375, + "learning_rate": 1e-06, + "loss": 0.2632, + "num_tokens": 366384745.0, + "reward": 0.16988316178321838, + "reward_std": 0.19900578260421753, + "rewards/progression_diversity/mean": -0.1527988612651825, + "rewards/progression_diversity/std": 0.21360373497009277, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.4359537959098816, + "rewards/symbolic_reward_partial_score/std": 0.35540899634361267, + "rewards/tag_count_reward/mean": -0.33203125, + "rewards/tag_count_reward/std": 0.47140273451805115, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913198947906494, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 16.873680114746094, + "step": 369 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.1968770995736122, + "epoch": 0.592948717948718, + "grad_norm": 528.3468627929688, + "learning_rate": 1e-06, + "loss": 0.3157, + "step": 370 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.3515625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.19093196094036102, + "epoch": 0.594551282051282, + "grad_norm": 962.9395751953125, + "learning_rate": 1e-06, + "loss": 0.3246, + "step": 371 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.20564716309309006, + "epoch": 0.5961538461538461, + "grad_norm": 17.06043243408203, + "learning_rate": 1e-06, + "loss": 0.281, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6353.0, + "completions/mean_length": 6398.890625, + "completions/mean_terminated_length": 2339.010986328125, + "completions/min_length": 467.0, + "completions/min_terminated_length": 467.0, + "entropy": 0.25001922249794006, + "epoch": 0.5977564102564102, + "frac_reward_zero_std": 0.0, + "grad_norm": 419.783447265625, + "learning_rate": 1e-06, + "loss": 0.2007, + "num_tokens": 370463153.0, + "reward": 0.15392863750457764, + "reward_std": 0.17637325823307037, + "rewards/progression_diversity/mean": -0.11885648965835571, + "rewards/progression_diversity/std": 0.2015119045972824, + "rewards/symbolic_reward_accuracy/mean": 0.052734375, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.5014322996139526, + "rewards/symbolic_reward_partial_score/std": 0.3110915422439575, + "rewards/tag_count_reward/mean": -0.26953125, + "rewards/tag_count_reward/std": 0.44415023922920227, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9979949593544006, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 16.023746490478516, + "step": 373 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2377210631966591, + "epoch": 0.5993589743589743, + "grad_norm": 631.596435546875, + "learning_rate": 1e-06, + "loss": 0.3175, + "step": 374 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2239432856440544, + "epoch": 0.6009615384615384, + "grad_norm": 676.9954223632812, + "learning_rate": 1e-06, + "loss": 0.307, + "step": 375 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2474696934223175, + "epoch": 0.6025641025641025, + "grad_norm": 75.13766479492188, + "learning_rate": 1e-06, + "loss": 0.207, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.3046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6881.0, + "completions/mean_length": 6655.6796875, + "completions/mean_terminated_length": 2392.7080078125, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.2293773591518402, + "epoch": 0.6041666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 938.2958984375, + "learning_rate": 1e-06, + "loss": 0.2922, + "num_tokens": 374795517.0, + "reward": 0.16000205278396606, + "reward_std": 0.18099814653396606, + "rewards/progression_diversity/mean": -0.11991134285926819, + "rewards/progression_diversity/std": 0.19749855995178223, + "rewards/symbolic_reward_accuracy/mean": 0.08203125, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.4598633050918579, + "rewards/symbolic_reward_partial_score/std": 0.3230631649494171, + "rewards/tag_count_reward/mean": -0.259765625, + "rewards/tag_count_reward/std": 0.4389347732067108, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0005474090576172, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 15.338663101196289, + "step": 377 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2615222707390785, + "epoch": 0.6057692307692307, + "grad_norm": 1569.6458740234375, + "learning_rate": 1e-06, + "loss": 0.1825, + "step": 378 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2232813537120819, + "epoch": 0.6073717948717948, + "grad_norm": 432.9651184082031, + "learning_rate": 1e-06, + "loss": 0.3715, + "step": 379 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.24861778318881989, + "epoch": 0.6089743589743589, + "grad_norm": 11.820572853088379, + "learning_rate": 1e-06, + "loss": 0.2359, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.31640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5300.0, + "completions/mean_length": 6716.076171875, + "completions/mean_terminated_length": 2241.20849609375, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.2513297498226166, + "epoch": 0.6105769230769231, + "frac_reward_zero_std": 0.0, + "grad_norm": 598.6245727539062, + "learning_rate": 1e-06, + "loss": 0.1991, + "num_tokens": 379059012.0, + "reward": 0.2126418650150299, + "reward_std": 0.20646947622299194, + "rewards/progression_diversity/mean": -0.1269288957118988, + "rewards/progression_diversity/std": 0.20075669884681702, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.4962402582168579, + "rewards/symbolic_reward_partial_score/std": 0.35096409916877747, + "rewards/tag_count_reward/mean": -0.287109375, + "rewards/tag_count_reward/std": 0.45285552740097046, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986449480056763, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 15.811168670654297, + "step": 381 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2153405398130417, + "epoch": 0.6121794871794872, + "grad_norm": 698.4373779296875, + "learning_rate": 1e-06, + "loss": 0.4257, + "step": 382 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.2359524369239807, + "epoch": 0.6137820512820513, + "grad_norm": 251.32823181152344, + "learning_rate": 1e-06, + "loss": 0.2656, + "step": 383 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.2591070830821991, + "epoch": 0.6153846153846154, + "grad_norm": 1601.9156494140625, + "learning_rate": 1e-06, + "loss": 0.1929, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13378.0, + "completions/mean_length": 5822.728515625, + "completions/mean_terminated_length": 2302.3046875, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "entropy": 0.27983543276786804, + "epoch": 0.6169871794871795, + "frac_reward_zero_std": 0.0, + "grad_norm": 612.2117919921875, + "learning_rate": 1e-06, + "loss": 0.1586, + "num_tokens": 382808633.0, + "reward": 0.22261884808540344, + "reward_std": 0.21902015805244446, + "rewards/progression_diversity/mean": -0.10237417370080948, + "rewards/progression_diversity/std": 0.1874011605978012, + "rewards/symbolic_reward_accuracy/mean": 0.150390625, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.522167980670929, + "rewards/symbolic_reward_partial_score/std": 0.3341177999973297, + "rewards/tag_count_reward/mean": -0.232421875, + "rewards/tag_count_reward/std": 0.42278963327407837, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0090166330337524, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 13.79207706451416, + "step": 385 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.24246982485055923, + "epoch": 0.6185897435897436, + "grad_norm": 324.906005859375, + "learning_rate": 1e-06, + "loss": 0.3371, + "step": 386 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2653020918369293, + "epoch": 0.6201923076923077, + "grad_norm": 195.28736877441406, + "learning_rate": 1e-06, + "loss": 0.2096, + "step": 387 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.2655164748430252, + "epoch": 0.6217948717948718, + "grad_norm": 68.55646514892578, + "learning_rate": 1e-06, + "loss": 0.2088, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.33984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12824.0, + "completions/mean_length": 7204.025390625, + "completions/mean_terminated_length": 2478.23974609375, + "completions/min_length": 636.0, + "completions/min_terminated_length": 636.0, + "entropy": 0.21731575578451157, + "epoch": 0.6233974358974359, + "frac_reward_zero_std": 0.0, + "grad_norm": 1553.82080078125, + "learning_rate": 1e-06, + "loss": 0.3047, + "num_tokens": 387345190.0, + "reward": 0.16609683632850647, + "reward_std": 0.20702703297138214, + "rewards/progression_diversity/mean": -0.1368994563817978, + "rewards/progression_diversity/std": 0.20169976353645325, + "rewards/symbolic_reward_accuracy/mean": 0.10546875, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.44949543476104736, + "rewards/symbolic_reward_partial_score/std": 0.3339717984199524, + "rewards/tag_count_reward/mean": -0.306640625, + "rewards/tag_count_reward/std": 0.4615498185157776, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934464693069458, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 16.535369873046875, + "step": 389 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.4921875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.65625, + "entropy": 0.1892649158835411, + "epoch": 0.625, + "grad_norm": 104.23577880859375, + "learning_rate": 1e-06, + "loss": 0.348, + "step": 390 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.24794911593198776, + "epoch": 0.6266025641025641, + "grad_norm": 0.032321542501449585, + "learning_rate": 1e-06, + "loss": 0.1668, + "step": 391 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.21500948816537857, + "epoch": 0.6282051282051282, + "grad_norm": 1.6792036294937134, + "learning_rate": 1e-06, + "loss": 0.2748, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.314453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13686.0, + "completions/mean_length": 6848.802734375, + "completions/mean_terminated_length": 2475.108154296875, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "entropy": 0.23190298676490784, + "epoch": 0.6298076923076923, + "frac_reward_zero_std": 0.0, + "grad_norm": 609.8284301757812, + "learning_rate": 1e-06, + "loss": 0.2339, + "num_tokens": 391783393.0, + "reward": 0.14935705065727234, + "reward_std": 0.17916199564933777, + "rewards/progression_diversity/mean": -0.1165420189499855, + "rewards/progression_diversity/std": 0.18923556804656982, + "rewards/symbolic_reward_accuracy/mean": 0.078125, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.44054362177848816, + "rewards/symbolic_reward_partial_score/std": 0.3393227458000183, + "rewards/tag_count_reward/mean": -0.28515625, + "rewards/tag_count_reward/std": 0.45193037390708923, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.002633810043335, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 14.761566162109375, + "step": 393 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2784353643655777, + "epoch": 0.6314102564102564, + "grad_norm": 262.9246520996094, + "learning_rate": 1e-06, + "loss": 0.1924, + "step": 394 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.3984375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.65625, + "entropy": 0.21299445629119873, + "epoch": 0.6330128205128205, + "grad_norm": 244.2508087158203, + "learning_rate": 1e-06, + "loss": 0.3124, + "step": 395 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.65625, + "entropy": 0.22290880978107452, + "epoch": 0.6346153846153846, + "grad_norm": 0.022922182455658913, + "learning_rate": 1e-06, + "loss": 0.3096, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.322265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6123.0, + "completions/mean_length": 6933.5625, + "completions/mean_terminated_length": 2439.838623046875, + "completions/min_length": 505.0, + "completions/min_terminated_length": 505.0, + "entropy": 0.25610800087451935, + "epoch": 0.6362179487179487, + "frac_reward_zero_std": 0.0, + "grad_norm": 492.3603210449219, + "learning_rate": 1e-06, + "loss": 0.2415, + "num_tokens": 396120849.0, + "reward": 0.14940249919891357, + "reward_std": 0.18980038166046143, + "rewards/progression_diversity/mean": -0.124204620718956, + "rewards/progression_diversity/std": 0.19669218361377716, + "rewards/symbolic_reward_accuracy/mean": 0.08203125, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.43704426288604736, + "rewards/symbolic_reward_partial_score/std": 0.3477852940559387, + "rewards/tag_count_reward/mean": -0.296875, + "rewards/tag_count_reward/std": 0.45732781291007996, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0012847185134888, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 15.557909965515137, + "step": 397 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.23665452748537064, + "epoch": 0.6378205128205128, + "grad_norm": 1791.0264892578125, + "learning_rate": 1e-06, + "loss": 0.2836, + "step": 398 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.2314099296927452, + "epoch": 0.6394230769230769, + "grad_norm": 193.3421630859375, + "learning_rate": 1e-06, + "loss": 0.3197, + "step": 399 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.24476408958435059, + "epoch": 0.6410256410256411, + "grad_norm": 7.516962051391602, + "learning_rate": 1e-06, + "loss": 0.2566, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6095.0, + "completions/mean_length": 6450.169921875, + "completions/mean_terminated_length": 2411.14013671875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.23504739999771118, + "epoch": 0.6426282051282052, + "frac_reward_zero_std": 0.0, + "grad_norm": 792.7737426757812, + "learning_rate": 1e-06, + "loss": 0.306, + "num_tokens": 400280488.0, + "reward": 0.18703122437000275, + "reward_std": 0.192929208278656, + "rewards/progression_diversity/mean": -0.10351836681365967, + "rewards/progression_diversity/std": 0.17522378265857697, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.4745442867279053, + "rewards/symbolic_reward_partial_score/std": 0.3318890631198883, + "rewards/tag_count_reward/mean": -0.24609375, + "rewards/tag_count_reward/std": 0.4311550557613373, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0084717273712158, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 14.434258460998535, + "step": 401 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.25011158734560013, + "epoch": 0.6442307692307693, + "grad_norm": 317.9718322753906, + "learning_rate": 1e-06, + "loss": 0.2485, + "step": 402 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.261634424328804, + "epoch": 0.6458333333333334, + "grad_norm": 102.99897766113281, + "learning_rate": 1e-06, + "loss": 0.2611, + "step": 403 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.2772502601146698, + "epoch": 0.6474358974358975, + "grad_norm": 31.346302032470703, + "learning_rate": 1e-06, + "loss": 0.1701, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.255859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7054.0, + "completions/mean_length": 6076.56640625, + "completions/mean_terminated_length": 2532.540771484375, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "entropy": 0.2811954766511917, + "epoch": 0.6490384615384616, + "frac_reward_zero_std": 0.0, + "grad_norm": 863.798828125, + "learning_rate": 1e-06, + "loss": 0.2297, + "num_tokens": 404250186.0, + "reward": 0.1544865369796753, + "reward_std": 0.17089833319187164, + "rewards/progression_diversity/mean": -0.08845682442188263, + "rewards/progression_diversity/std": 0.16534774005413055, + "rewards/symbolic_reward_accuracy/mean": 0.05078125, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.4886067509651184, + "rewards/symbolic_reward_partial_score/std": 0.3330814838409424, + "rewards/tag_count_reward/mean": -0.216796875, + "rewards/tag_count_reward/std": 0.4124660789966583, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0118536949157715, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 13.839906692504883, + "step": 405 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.269717812538147, + "epoch": 0.6506410256410257, + "grad_norm": 659.154052734375, + "learning_rate": 1e-06, + "loss": 0.2937, + "step": 406 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.25652148574590683, + "epoch": 0.6522435897435898, + "grad_norm": 0.023287350311875343, + "learning_rate": 1e-06, + "loss": 0.2854, + "step": 407 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3671875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6015625, + "entropy": 0.2793606072664261, + "epoch": 0.6538461538461539, + "grad_norm": 0.029362130910158157, + "learning_rate": 1e-06, + "loss": 0.2409, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7332.0, + "completions/mean_length": 6744.724609375, + "completions/mean_terminated_length": 2674.808349609375, + "completions/min_length": 504.0, + "completions/min_terminated_length": 504.0, + "entropy": 0.23652763664722443, + "epoch": 0.655448717948718, + "frac_reward_zero_std": 0.0, + "grad_norm": 488.6410827636719, + "learning_rate": 1e-06, + "loss": 0.3058, + "num_tokens": 408669469.0, + "reward": 0.1471533626317978, + "reward_std": 0.18105512857437134, + "rewards/progression_diversity/mean": -0.10253537446260452, + "rewards/progression_diversity/std": 0.17895787954330444, + "rewards/symbolic_reward_accuracy/mean": 0.076171875, + "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, + "rewards/symbolic_reward_partial_score/mean": 0.42687174677848816, + "rewards/symbolic_reward_partial_score/std": 0.3458375334739685, + "rewards/tag_count_reward/mean": -0.255859375, + "rewards/tag_count_reward/std": 0.43676990270614624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0076942443847656, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 520.0, + "sampling/sampling_logp_difference/mean": 13.969697952270508, + "step": 409 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.2513129413127899, + "epoch": 0.657051282051282, + "grad_norm": 781.8362426757812, + "learning_rate": 1e-06, + "loss": 0.3877, + "step": 410 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.640625, + "entropy": 0.2599927484989166, + "epoch": 0.6586538461538461, + "grad_norm": 8.609899520874023, + "learning_rate": 1e-06, + "loss": 0.2786, + "step": 411 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3515625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.23813901096582413, + "epoch": 0.6602564102564102, + "grad_norm": 0.03463734686374664, + "learning_rate": 1e-06, + "loss": 0.3067, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.248046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13444.0, + "completions/mean_length": 6043.830078125, + "completions/mean_terminated_length": 2632.916748046875, + "completions/min_length": 596.0, + "completions/min_terminated_length": 596.0, + "entropy": 0.2797359824180603, + "epoch": 0.6618589743589743, + "frac_reward_zero_std": 0.0, + "grad_norm": 446.9456787109375, + "learning_rate": 1e-06, + "loss": 0.2367, + "num_tokens": 412617750.0, + "reward": 0.1771666407585144, + "reward_std": 0.18979674577713013, + "rewards/progression_diversity/mean": -0.08070126920938492, + "rewards/progression_diversity/std": 0.15634961426258087, + "rewards/symbolic_reward_accuracy/mean": 0.09375, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.4754069149494171, + "rewards/symbolic_reward_partial_score/std": 0.336232453584671, + "rewards/tag_count_reward/mean": -0.208984375, + "rewards/tag_count_reward/std": 0.40698084235191345, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0141136646270752, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 524.0, + "sampling/sampling_logp_difference/mean": 12.855762481689453, + "step": 413 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2539880573749542, + "epoch": 0.6634615384615384, + "grad_norm": 368.5082092285156, + "learning_rate": 1e-06, + "loss": 0.3098, + "step": 414 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2531498521566391, + "epoch": 0.6650641025641025, + "grad_norm": 162.2779541015625, + "learning_rate": 1e-06, + "loss": 0.2807, + "step": 415 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.2950977236032486, + "epoch": 0.6666666666666666, + "grad_norm": 129.06211853027344, + "learning_rate": 1e-06, + "loss": 0.1478, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.19921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12448.0, + "completions/mean_length": 5337.125, + "completions/mean_terminated_length": 2588.8779296875, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "entropy": 0.28342205286026, + "epoch": 0.6682692307692307, + "frac_reward_zero_std": 0.0, + "grad_norm": 795.7291259765625, + "learning_rate": 1e-06, + "loss": 0.2281, + "num_tokens": 416218758.0, + "reward": 0.18160498142242432, + "reward_std": 0.17491479218006134, + "rewards/progression_diversity/mean": -0.06606545299291611, + "rewards/progression_diversity/std": 0.14889149367809296, + "rewards/symbolic_reward_accuracy/mean": 0.087890625, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.48906251788139343, + "rewards/symbolic_reward_partial_score/std": 0.3222104609012604, + "rewards/tag_count_reward/mean": -0.171875, + "rewards/tag_count_reward/std": 0.3776407241821289, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0209629535675049, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 524.0, + "sampling/sampling_logp_difference/mean": 11.58393383026123, + "step": 417 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.29566076397895813, + "epoch": 0.6698717948717948, + "grad_norm": 879.3897094726562, + "learning_rate": 1e-06, + "loss": 0.2653, + "step": 418 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2756947875022888, + "epoch": 0.6714743589743589, + "grad_norm": 92.3701400756836, + "learning_rate": 1e-06, + "loss": 0.2869, + "step": 419 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.31204286217689514, + "epoch": 0.6730769230769231, + "grad_norm": 0.028281621634960175, + "learning_rate": 1e-06, + "loss": 0.1507, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6618.0, + "completions/mean_length": 5795.564453125, + "completions/mean_terminated_length": 2693.901611328125, + "completions/min_length": 476.0, + "completions/min_terminated_length": 476.0, + "entropy": 0.28321386873722076, + "epoch": 0.6746794871794872, + "frac_reward_zero_std": 0.0, + "grad_norm": 1091.48193359375, + "learning_rate": 1e-06, + "loss": 0.2515, + "num_tokens": 420108631.0, + "reward": 0.20123516023159027, + "reward_std": 0.18499191105365753, + "rewards/progression_diversity/mean": -0.06691451370716095, + "rewards/progression_diversity/std": 0.14319713413715363, + "rewards/symbolic_reward_accuracy/mean": 0.130859375, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.47119140625, + "rewards/symbolic_reward_partial_score/std": 0.3467746376991272, + "rewards/tag_count_reward/mean": -0.1796875, + "rewards/tag_count_reward/std": 0.38430243730545044, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0261237621307373, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 524.0, + "sampling/sampling_logp_difference/mean": 10.4873685836792, + "step": 421 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.29514726996421814, + "epoch": 0.6762820512820513, + "grad_norm": 0.027155712246894836, + "learning_rate": 1e-06, + "loss": 0.1629, + "step": 422 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.27398838102817535, + "epoch": 0.6778846153846154, + "grad_norm": 0.03398658335208893, + "learning_rate": 1e-06, + "loss": 0.2472, + "step": 423 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.28561370074748993, + "epoch": 0.6794871794871795, + "grad_norm": 0.03232736140489578, + "learning_rate": 1e-06, + "loss": 0.1731, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8604.0, + "completions/mean_length": 6224.42578125, + "completions/mean_terminated_length": 2766.97900390625, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "entropy": 0.28223684430122375, + "epoch": 0.6810897435897436, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1497.0281982421875, + "learning_rate": 1e-06, + "loss": 0.2344, + "num_tokens": 424132385.0, + "reward": 0.1739288568496704, + "reward_std": 0.19503268599510193, + "rewards/progression_diversity/mean": -0.07684145867824554, + "rewards/progression_diversity/std": 0.15114691853523254, + "rewards/symbolic_reward_accuracy/mean": 0.09375, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.46123045682907104, + "rewards/symbolic_reward_partial_score/std": 0.3385530114173889, + "rewards/tag_count_reward/mean": -0.19921875, + "rewards/tag_count_reward/std": 0.39980348944664, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.015244483947754, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 524.0, + "sampling/sampling_logp_difference/mean": 12.87832260131836, + "step": 425 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.2601533979177475, + "epoch": 0.6826923076923077, + "grad_norm": 5.701667785644531, + "learning_rate": 1e-06, + "loss": 0.2746, + "step": 426 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.28981634974479675, + "epoch": 0.6842948717948718, + "grad_norm": 0.028896579518914223, + "learning_rate": 1e-06, + "loss": 0.1887, + "step": 427 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.3828125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.268518328666687, + "epoch": 0.6858974358974359, + "grad_norm": 0.03334295004606247, + "learning_rate": 1e-06, + "loss": 0.2642, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.267578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8233.0, + "completions/mean_length": 6416.837890625, + "completions/mean_terminated_length": 2775.501220703125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.26624637842178345, + "epoch": 0.6875, + "frac_reward_zero_std": 0.0, + "grad_norm": 960.3082885742188, + "learning_rate": 1e-06, + "loss": 0.2586, + "num_tokens": 428286878.0, + "reward": 0.19910672307014465, + "reward_std": 0.20500460267066956, + "rewards/progression_diversity/mean": -0.07907611131668091, + "rewards/progression_diversity/std": 0.1532817780971527, + "rewards/symbolic_reward_accuracy/mean": 0.130859375, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.4729654788970947, + "rewards/symbolic_reward_partial_score/std": 0.36421647667884827, + "rewards/tag_count_reward/mean": -0.205078125, + "rewards/tag_count_reward/std": 0.4041535556316376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0138148069381714, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 524.0, + "sampling/sampling_logp_difference/mean": 12.851956367492676, + "step": 429 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2550949156284332, + "epoch": 0.6891025641025641, + "grad_norm": 0.16806720197200775, + "learning_rate": 1e-06, + "loss": 0.2759, + "step": 430 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.28539833426475525, + "epoch": 0.6907051282051282, + "grad_norm": 0.027446668595075607, + "learning_rate": 1e-06, + "loss": 0.2034, + "step": 431 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.2528308480978012, + "epoch": 0.6923076923076923, + "grad_norm": 0.4935199022293091, + "learning_rate": 1e-06, + "loss": 0.2769, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.22265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6021.0, + "completions/mean_length": 5849.77734375, + "completions/mean_terminated_length": 2832.437255859375, + "completions/min_length": 514.0, + "completions/min_terminated_length": 514.0, + "entropy": 0.2618635445833206, + "epoch": 0.6939102564102564, + "frac_reward_zero_std": 0.0, + "grad_norm": 2757.423828125, + "learning_rate": 1e-06, + "loss": 0.2911, + "num_tokens": 432106236.0, + "reward": 0.14727163314819336, + "reward_std": 0.16309772431850433, + "rewards/progression_diversity/mean": -0.06434153020381927, + "rewards/progression_diversity/std": 0.13670898973941803, + "rewards/symbolic_reward_accuracy/mean": 0.0546875, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.44357097148895264, + "rewards/symbolic_reward_partial_score/std": 0.31754741072654724, + "rewards/tag_count_reward/mean": -0.1796875, + "rewards/tag_count_reward/std": 0.38430243730545044, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.018375039100647, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 524.0, + "sampling/sampling_logp_difference/mean": 12.233776092529297, + "step": 433 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2820604145526886, + "epoch": 0.6955128205128205, + "grad_norm": 319.56304931640625, + "learning_rate": 1e-06, + "loss": 0.2209, + "step": 434 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.2742318958044052, + "epoch": 0.6971153846153846, + "grad_norm": 0.03161383047699928, + "learning_rate": 1e-06, + "loss": 0.232, + "step": 435 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.28273947536945343, + "epoch": 0.6987179487179487, + "grad_norm": 0.10165081173181534, + "learning_rate": 1e-06, + "loss": 0.194, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13592.0, + "completions/mean_length": 5897.6796875, + "completions/mean_terminated_length": 2825.92919921875, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "entropy": 0.2722629904747009, + "epoch": 0.7003205128205128, + "frac_reward_zero_std": 0.0, + "grad_norm": 247.05191040039062, + "learning_rate": 1e-06, + "loss": 0.2686, + "num_tokens": 435945464.0, + "reward": 0.17971304059028625, + "reward_std": 0.16216953098773956, + "rewards/progression_diversity/mean": -0.043344639241695404, + "rewards/progression_diversity/std": 0.10168621689081192, + "rewards/symbolic_reward_accuracy/mean": 0.08984375, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.4813476502895355, + "rewards/symbolic_reward_partial_score/std": 0.332892507314682, + "rewards/tag_count_reward/mean": -0.181640625, + "rewards/tag_count_reward/std": 0.38592514395713806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0249509811401367, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 524.0, + "sampling/sampling_logp_difference/mean": 9.850525856018066, + "step": 437 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.2895769029855728, + "epoch": 0.7019230769230769, + "grad_norm": 77.53079986572266, + "learning_rate": 1e-06, + "loss": 0.2535, + "step": 438 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3120630830526352, + "epoch": 0.7035256410256411, + "grad_norm": 0.035726454108953476, + "learning_rate": 1e-06, + "loss": 0.2106, + "step": 439 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3188182860612869, + "epoch": 0.7051282051282052, + "grad_norm": 26.437782287597656, + "learning_rate": 1e-06, + "loss": 0.2773, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.20703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15335.0, + "completions/mean_length": 5722.31640625, + "completions/mean_terminated_length": 2938.72412109375, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "entropy": 0.31573159992694855, + "epoch": 0.7067307692307693, + "frac_reward_zero_std": 0.0, + "grad_norm": 219.33189392089844, + "learning_rate": 1e-06, + "loss": 0.2088, + "num_tokens": 439743898.0, + "reward": 0.16550663113594055, + "reward_std": 0.17046445608139038, + "rewards/progression_diversity/mean": -0.00011906892905244604, + "rewards/progression_diversity/std": 0.002519554691389203, + "rewards/symbolic_reward_accuracy/mean": 0.068359375, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + "rewards/symbolic_reward_partial_score/mean": 0.4709635376930237, + "rewards/symbolic_reward_partial_score/std": 0.31855061650276184, + "rewards/tag_count_reward/mean": -0.16796875, + "rewards/tag_count_reward/std": 0.374204158782959, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0505764484405518, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 1.717551350593567, + "step": 441 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.3088010251522064, + "epoch": 0.7083333333333334, + "grad_norm": 0.03410027176141739, + "learning_rate": 1e-06, + "loss": 0.1864, + "step": 442 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.3080977499485016, + "epoch": 0.7099358974358975, + "grad_norm": 0.03793443366885185, + "learning_rate": 1e-06, + "loss": 0.2849, + "step": 443 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.3152337819337845, + "epoch": 0.7115384615384616, + "grad_norm": 0.02681701071560383, + "learning_rate": 1e-06, + "loss": 0.2083, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.240234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7462.0, + "completions/mean_length": 6347.04296875, + "completions/mean_terminated_length": 3173.403564453125, + "completions/min_length": 535.0, + "completions/min_terminated_length": 535.0, + "entropy": 0.2724430710077286, + "epoch": 0.7131410256410257, + "frac_reward_zero_std": 0.0, + "grad_norm": 317.01702880859375, + "learning_rate": 1e-06, + "loss": 0.2599, + "num_tokens": 443837808.0, + "reward": 0.15419574081897736, + "reward_std": 0.1597881019115448, + "rewards/progression_diversity/mean": -0.0023014158941805363, + "rewards/progression_diversity/std": 0.015193020924925804, + "rewards/symbolic_reward_accuracy/mean": 0.060546875, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.4554687738418579, + "rewards/symbolic_reward_partial_score/std": 0.3217701315879822, + "rewards/tag_count_reward/mean": -0.1875, + "rewards/tag_count_reward/std": 0.39069411158561707, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0374995470046997, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 482.0, + "sampling/sampling_logp_difference/mean": 2.7787814140319824, + "step": 445 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.3045268654823303, + "epoch": 0.7147435897435898, + "grad_norm": 0.030019085854291916, + "learning_rate": 1e-06, + "loss": 0.178, + "step": 446 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.26429812610149384, + "epoch": 0.7163461538461539, + "grad_norm": 0.038714099675416946, + "learning_rate": 1e-06, + "loss": 0.3328, + "step": 447 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.28812122344970703, + "epoch": 0.717948717948718, + "grad_norm": 0.03327601030468941, + "learning_rate": 1e-06, + "loss": 0.2087, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.25, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8403.0, + "completions/mean_length": 6429.9140625, + "completions/mean_terminated_length": 3111.885498046875, + "completions/min_length": 725.0, + "completions/min_terminated_length": 725.0, + "entropy": 0.27920031547546387, + "epoch": 0.719551282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 255.1264190673828, + "learning_rate": 1e-06, + "loss": 0.2454, + "num_tokens": 448087220.0, + "reward": 0.14131547510623932, + "reward_std": 0.1714935302734375, + "rewards/progression_diversity/mean": -0.005660384893417358, + "rewards/progression_diversity/std": 0.01929020695388317, + "rewards/symbolic_reward_accuracy/mean": 0.052734375, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.43413084745407104, + "rewards/symbolic_reward_partial_score/std": 0.3264063894748688, + "rewards/tag_count_reward/mean": -0.205078125, + "rewards/tag_count_reward/std": 0.4041535556316376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0356756448745728, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 454.0, + "sampling/sampling_logp_difference/mean": 3.235456943511963, + "step": 449 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.27578917145729065, + "epoch": 0.7211538461538461, + "grad_norm": 1.0962835550308228, + "learning_rate": 1e-06, + "loss": 0.2381, + "step": 450 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2769124358892441, + "epoch": 0.7227564102564102, + "grad_norm": 0.03778436407446861, + "learning_rate": 1e-06, + "loss": 0.2373, + "step": 451 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.25797323882579803, + "epoch": 0.7243589743589743, + "grad_norm": 0.07790114730596542, + "learning_rate": 1e-06, + "loss": 0.2823, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.23046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13926.0, + "completions/mean_length": 6445.306640625, + "completions/mean_terminated_length": 3468.74365234375, + "completions/min_length": 525.0, + "completions/min_terminated_length": 525.0, + "entropy": 0.2812911868095398, + "epoch": 0.7259615384615384, + "frac_reward_zero_std": 0.0, + "grad_norm": 242.18006896972656, + "learning_rate": 1e-06, + "loss": 0.2006, + "num_tokens": 452275297.0, + "reward": 0.14035208523273468, + "reward_std": 0.1568242758512497, + "rewards/progression_diversity/mean": -0.00873718224465847, + "rewards/progression_diversity/std": 0.02793550305068493, + "rewards/symbolic_reward_accuracy/mean": 0.0546875, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.42190754413604736, + "rewards/symbolic_reward_partial_score/std": 0.31549105048179626, + "rewards/tag_count_reward/mean": -0.189453125, + "rewards/tag_count_reward/std": 0.3922513723373413, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0444124937057495, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 400.0, + "sampling/sampling_logp_difference/mean": 3.6778717041015625, + "step": 453 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2804671823978424, + "epoch": 0.7275641025641025, + "grad_norm": 0.040219251066446304, + "learning_rate": 1e-06, + "loss": 0.1904, + "step": 454 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.26130877435207367, + "epoch": 0.7291666666666666, + "grad_norm": 0.05419059470295906, + "learning_rate": 1e-06, + "loss": 0.2466, + "step": 455 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.2619156539440155, + "epoch": 0.7307692307692307, + "grad_norm": 6.435583114624023, + "learning_rate": 1e-06, + "loss": 0.3236, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.205078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7699.0, + "completions/mean_length": 6036.04296875, + "completions/mean_terminated_length": 3366.422607421875, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "entropy": 0.30352291464805603, + "epoch": 0.7323717948717948, + "frac_reward_zero_std": 0.0, + "grad_norm": 363.8023681640625, + "learning_rate": 1e-06, + "loss": 0.1275, + "num_tokens": 456286823.0, + "reward": 0.13116447627544403, + "reward_std": 0.14602042734622955, + "rewards/progression_diversity/mean": -0.00855403020977974, + "rewards/progression_diversity/std": 0.028096795082092285, + "rewards/symbolic_reward_accuracy/mean": 0.03515625, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.4192708432674408, + "rewards/symbolic_reward_partial_score/std": 0.30005842447280884, + "rewards/tag_count_reward/mean": -0.15625, + "rewards/tag_count_reward/std": 0.36344730854034424, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0444647073745728, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 430.0, + "sampling/sampling_logp_difference/mean": 5.1553144454956055, + "step": 457 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2788728326559067, + "epoch": 0.7339743589743589, + "grad_norm": 0.03529650345444679, + "learning_rate": 1e-06, + "loss": 0.2712, + "step": 458 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2527826279401779, + "epoch": 0.7355769230769231, + "grad_norm": 0.02542269416153431, + "learning_rate": 1e-06, + "loss": 0.3004, + "step": 459 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.27167366445064545, + "epoch": 0.7371794871794872, + "grad_norm": 0.02993706800043583, + "learning_rate": 1e-06, + "loss": 0.2269, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.185546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7170.0, + "completions/mean_length": 5713.298828125, + "completions/mean_terminated_length": 3282.32373046875, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.2820378243923187, + "epoch": 0.7387820512820513, + "frac_reward_zero_std": 0.0, + "grad_norm": 278.36627197265625, + "learning_rate": 1e-06, + "loss": 0.1824, + "num_tokens": 460071312.0, + "reward": 0.17337031662464142, + "reward_std": 0.17275133728981018, + "rewards/progression_diversity/mean": -0.004277626518160105, + "rewards/progression_diversity/std": 0.01868487522006035, + "rewards/symbolic_reward_accuracy/mean": 0.072265625, + "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, + "rewards/symbolic_reward_partial_score/mean": 0.485595703125, + "rewards/symbolic_reward_partial_score/std": 0.31720206141471863, + "rewards/tag_count_reward/mean": -0.15625, + "rewards/tag_count_reward/std": 0.36344730854034424, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0428156852722168, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 456.0, + "sampling/sampling_logp_difference/mean": 5.391382217407227, + "step": 461 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2974891811609268, + "epoch": 0.7403846153846154, + "grad_norm": 0.03358296677470207, + "learning_rate": 1e-06, + "loss": 0.1633, + "step": 462 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.27032434195280075, + "epoch": 0.7419871794871795, + "grad_norm": 0.03503183275461197, + "learning_rate": 1e-06, + "loss": 0.2628, + "step": 463 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2741468995809555, + "epoch": 0.7435897435897436, + "grad_norm": 0.047111768275499344, + "learning_rate": 1e-06, + "loss": 0.2698, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.224609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15885.0, + "completions/mean_length": 6353.427734375, + "completions/mean_terminated_length": 3447.84619140625, + "completions/min_length": 490.0, + "completions/min_terminated_length": 490.0, + "entropy": 0.259844645857811, + "epoch": 0.7451923076923077, + "frac_reward_zero_std": 0.0, + "grad_norm": 823.7109985351562, + "learning_rate": 1e-06, + "loss": 0.2066, + "num_tokens": 464320411.0, + "reward": 0.13118945062160492, + "reward_std": 0.14172431826591492, + "rewards/progression_diversity/mean": -0.009473828598856926, + "rewards/progression_diversity/std": 0.026718810200691223, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.42329102754592896, + "rewards/symbolic_reward_partial_score/std": 0.2852300703525543, + "rewards/tag_count_reward/mean": -0.19140625, + "rewards/tag_count_reward/std": 0.3937928080558777, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0352518558502197, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 482.0, + "sampling/sampling_logp_difference/mean": 8.388681411743164, + "step": 465 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.3671875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.609375, + "entropy": 0.25056569278240204, + "epoch": 0.7467948717948718, + "grad_norm": 0.02025044709444046, + "learning_rate": 1e-06, + "loss": 0.2388, + "step": 466 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.25776257365942, + "epoch": 0.7483974358974359, + "grad_norm": 0.026951203122735023, + "learning_rate": 1e-06, + "loss": 0.248, + "step": 467 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.25720326602458954, + "epoch": 0.75, + "grad_norm": 0.03187187388539314, + "learning_rate": 1e-06, + "loss": 0.2192, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.154296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12745.0, + "completions/mean_length": 5212.98828125, + "completions/mean_terminated_length": 3174.859130859375, + "completions/min_length": 439.0, + "completions/min_terminated_length": 439.0, + "entropy": 0.28627626597881317, + "epoch": 0.7516025641025641, + "frac_reward_zero_std": 0.0, + "grad_norm": 330.2890930175781, + "learning_rate": 1e-06, + "loss": 0.1308, + "num_tokens": 467844069.0, + "reward": 0.1364753395318985, + "reward_std": 0.13440468907356262, + "rewards/progression_diversity/mean": -0.009204398840665817, + "rewards/progression_diversity/std": 0.028541414067149162, + "rewards/symbolic_reward_accuracy/mean": 0.021484375, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.4545735716819763, + "rewards/symbolic_reward_partial_score/std": 0.28182342648506165, + "rewards/tag_count_reward/mean": -0.126953125, + "rewards/tag_count_reward/std": 0.33324605226516724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0359776020050049, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 500.0, + "sampling/sampling_logp_difference/mean": 9.518253326416016, + "step": 469 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2785252183675766, + "epoch": 0.7532051282051282, + "grad_norm": 0.04603856801986694, + "learning_rate": 1e-06, + "loss": 0.226, + "step": 470 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.28009602427482605, + "epoch": 0.7548076923076923, + "grad_norm": 0.03989138454198837, + "learning_rate": 1e-06, + "loss": 0.1848, + "step": 471 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.28221395611763, + "epoch": 0.7564102564102564, + "grad_norm": 0.04469291865825653, + "learning_rate": 1e-06, + "loss": 0.1911, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7659.0, + "completions/mean_length": 5128.966796875, + "completions/mean_terminated_length": 3106.1728515625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.26698917895555496, + "epoch": 0.7580128205128205, + "frac_reward_zero_std": 0.0, + "grad_norm": 593.4261474609375, + "learning_rate": 1e-06, + "loss": 0.2374, + "num_tokens": 471326436.0, + "reward": 0.17709538340568542, + "reward_std": 0.15860500931739807, + "rewards/progression_diversity/mean": -0.008724970743060112, + "rewards/progression_diversity/std": 0.03066168539226055, + "rewards/symbolic_reward_accuracy/mean": 0.07421875, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.48448890447616577, + "rewards/symbolic_reward_partial_score/std": 0.2948387861251831, + "rewards/tag_count_reward/mean": -0.126953125, + "rewards/tag_count_reward/std": 0.33324605226516724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0322108268737793, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 516.0, + "sampling/sampling_logp_difference/mean": 10.949844360351562, + "step": 473 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.28319013118743896, + "epoch": 0.7596153846153846, + "grad_norm": 0.02427232451736927, + "learning_rate": 1e-06, + "loss": 0.196, + "step": 474 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.29725903272628784, + "epoch": 0.7612179487179487, + "grad_norm": 0.03828584402799606, + "learning_rate": 1e-06, + "loss": 0.1165, + "step": 475 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.2925011217594147, + "epoch": 0.7628205128205128, + "grad_norm": 0.030051294714212418, + "learning_rate": 1e-06, + "loss": 0.1457, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.173828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6983.0, + "completions/mean_length": 5560.11328125, + "completions/mean_terminated_length": 3282.7470703125, + "completions/min_length": 565.0, + "completions/min_terminated_length": 565.0, + "entropy": 0.2824954092502594, + "epoch": 0.7644230769230769, + "frac_reward_zero_std": 0.0, + "grad_norm": 1322.20654296875, + "learning_rate": 1e-06, + "loss": 0.126, + "num_tokens": 475077582.0, + "reward": 0.1587892770767212, + "reward_std": 0.146462082862854, + "rewards/progression_diversity/mean": -0.010721936821937561, + "rewards/progression_diversity/std": 0.032617803663015366, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.45152994990348816, + "rewards/symbolic_reward_partial_score/std": 0.2799621820449829, + "rewards/tag_count_reward/mean": -0.140625, + "rewards/tag_count_reward/std": 0.3479743003845215, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0238516330718994, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 532.0, + "sampling/sampling_logp_difference/mean": 12.856636047363281, + "step": 477 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.24741245806217194, + "epoch": 0.7660256410256411, + "grad_norm": 0.03778098151087761, + "learning_rate": 1e-06, + "loss": 0.2498, + "step": 478 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.26974865794181824, + "epoch": 0.7676282051282052, + "grad_norm": 0.03347004950046539, + "learning_rate": 1e-06, + "loss": 0.193, + "step": 479 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2672978490591049, + "epoch": 0.7692307692307693, + "grad_norm": 7.450833320617676, + "learning_rate": 1e-06, + "loss": 0.1819, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15833.0, + "completions/mean_length": 5183.740234375, + "completions/mean_terminated_length": 3109.6181640625, + "completions/min_length": 548.0, + "completions/min_terminated_length": 548.0, + "entropy": 0.27999909222126007, + "epoch": 0.7708333333333334, + "frac_reward_zero_std": 0.0, + "grad_norm": 328.314208984375, + "learning_rate": 1e-06, + "loss": 0.1661, + "num_tokens": 478652985.0, + "reward": 0.16447407007217407, + "reward_std": 0.16247986257076263, + "rewards/progression_diversity/mean": -0.009626220911741257, + "rewards/progression_diversity/std": 0.03150993958115578, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.4671875238418579, + "rewards/symbolic_reward_partial_score/std": 0.2948901653289795, + "rewards/tag_count_reward/mean": -0.130859375, + "rewards/tag_count_reward/std": 0.33757632970809937, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0216107368469238, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 544.0, + "sampling/sampling_logp_difference/mean": 14.160472869873047, + "step": 481 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.27840910851955414, + "epoch": 0.7724358974358975, + "grad_norm": 0.05339457094669342, + "learning_rate": 1e-06, + "loss": 0.1271, + "step": 482 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.2735966593027115, + "epoch": 0.7740384615384616, + "grad_norm": 0.03152226284146309, + "learning_rate": 1e-06, + "loss": 0.2084, + "step": 483 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.27197548747062683, + "epoch": 0.7756410256410257, + "grad_norm": 0.027287054806947708, + "learning_rate": 1e-06, + "loss": 0.1888, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6238.0, + "completions/mean_length": 5183.2421875, + "completions/mean_terminated_length": 3109.02783203125, + "completions/min_length": 741.0, + "completions/min_terminated_length": 741.0, + "entropy": 0.2760499268770218, + "epoch": 0.7772435897435898, + "frac_reward_zero_std": 0.0, + "grad_norm": 610.1857299804688, + "learning_rate": 1e-06, + "loss": 0.1777, + "num_tokens": 482058533.0, + "reward": 0.16969150304794312, + "reward_std": 0.13450489938259125, + "rewards/progression_diversity/mean": -0.0122962836176157, + "rewards/progression_diversity/std": 0.034512363374233246, + "rewards/symbolic_reward_accuracy/mean": 0.06640625, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.47880858182907104, + "rewards/symbolic_reward_partial_score/std": 0.29609718918800354, + "rewards/tag_count_reward/mean": -0.13671875, + "rewards/tag_count_reward/std": 0.3438861668109894, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.017279863357544, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 560.0, + "sampling/sampling_logp_difference/mean": 15.952376365661621, + "step": 485 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2726978659629822, + "epoch": 0.7788461538461539, + "grad_norm": 0.029111366719007492, + "learning_rate": 1e-06, + "loss": 0.222, + "step": 486 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.29166853427886963, + "epoch": 0.780448717948718, + "grad_norm": 0.03601072356104851, + "learning_rate": 1e-06, + "loss": 0.1032, + "step": 487 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.2872645705938339, + "epoch": 0.782051282051282, + "grad_norm": 0.02848963811993599, + "learning_rate": 1e-06, + "loss": 0.1847, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.12109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13892.0, + "completions/mean_length": 4681.236328125, + "completions/mean_terminated_length": 3068.855712890625, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.2787110507488251, + "epoch": 0.7836538461538461, + "frac_reward_zero_std": 0.0, + "grad_norm": 408.6405944824219, + "learning_rate": 1e-06, + "loss": 0.2056, + "num_tokens": 485320734.0, + "reward": 0.17335151135921478, + "reward_std": 0.15505962073802948, + "rewards/progression_diversity/mean": -0.010063882917165756, + "rewards/progression_diversity/std": 0.03297411650419235, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.49549156427383423, + "rewards/symbolic_reward_partial_score/std": 0.2642830014228821, + "rewards/tag_count_reward/mean": -0.103515625, + "rewards/tag_count_reward/std": 0.30492907762527466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0186336040496826, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 576.0, + "sampling/sampling_logp_difference/mean": 15.60879135131836, + "step": 489 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.28752225637435913, + "epoch": 0.7852564102564102, + "grad_norm": 0.027989018708467484, + "learning_rate": 1e-06, + "loss": 0.1339, + "step": 490 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.28790904581546783, + "epoch": 0.7868589743589743, + "grad_norm": 0.028247352689504623, + "learning_rate": 1e-06, + "loss": 0.1596, + "step": 491 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.2850726991891861, + "epoch": 0.7884615384615384, + "grad_norm": 0.030095672234892845, + "learning_rate": 1e-06, + "loss": 0.1193, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6460.0, + "completions/mean_length": 4684.54296875, + "completions/mean_terminated_length": 3247.767578125, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "entropy": 0.2784768342971802, + "epoch": 0.7900641025641025, + "frac_reward_zero_std": 0.0, + "grad_norm": 769.5609741210938, + "learning_rate": 1e-06, + "loss": 0.1766, + "num_tokens": 488602388.0, + "reward": 0.16056574881076813, + "reward_std": 0.1394985318183899, + "rewards/progression_diversity/mean": -0.008366829715669155, + "rewards/progression_diversity/std": 0.0340796634554863, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.4892740845680237, + "rewards/symbolic_reward_partial_score/std": 0.26915544271469116, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0246632099151611, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 592.0, + "sampling/sampling_logp_difference/mean": 13.432557106018066, + "step": 493 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2812363803386688, + "epoch": 0.7916666666666666, + "grad_norm": 0.02783721312880516, + "learning_rate": 1e-06, + "loss": 0.1083, + "step": 494 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2817336320877075, + "epoch": 0.7932692307692307, + "grad_norm": 0.027454564347863197, + "learning_rate": 1e-06, + "loss": 0.1656, + "step": 495 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.28926005959510803, + "epoch": 0.7948717948717948, + "grad_norm": 0.052017685025930405, + "learning_rate": 1e-06, + "loss": 0.0861, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6732.0, + "completions/mean_length": 4107.0703125, + "completions/mean_terminated_length": 3038.378173828125, + "completions/min_length": 598.0, + "completions/min_terminated_length": 598.0, + "entropy": 0.31334738433361053, + "epoch": 0.7964743589743589, + "frac_reward_zero_std": 0.0, + "grad_norm": 333.04046630859375, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 491578936.0, + "reward": 0.1752394437789917, + "reward_std": 0.12819209694862366, + "rewards/progression_diversity/mean": -0.006330978125333786, + "rewards/progression_diversity/std": 0.025279799476265907, + "rewards/symbolic_reward_accuracy/mean": 0.04296875, + "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, + "rewards/symbolic_reward_partial_score/mean": 0.5198893547058105, + "rewards/symbolic_reward_partial_score/std": 0.25531089305877686, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0289983749389648, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 608.0, + "sampling/sampling_logp_difference/mean": 12.241405487060547, + "step": 497 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.27990762889385223, + "epoch": 0.7980769230769231, + "grad_norm": 0.03255309909582138, + "learning_rate": 1e-06, + "loss": 0.1956, + "step": 498 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2794528156518936, + "epoch": 0.7996794871794872, + "grad_norm": 0.02955903671681881, + "learning_rate": 1e-06, + "loss": 0.1626, + "step": 499 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.28499020636081696, + "epoch": 0.8012820512820513, + "grad_norm": 0.031167522072792053, + "learning_rate": 1e-06, + "loss": 0.1355, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7721.0, + "completions/mean_length": 4657.025390625, + "completions/mean_terminated_length": 3216.87060546875, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "entropy": 0.2538497596979141, + "epoch": 0.8028846153846154, + "frac_reward_zero_std": 0.0, + "grad_norm": 414.44403076171875, + "learning_rate": 1e-06, + "loss": 0.1954, + "num_tokens": 494832389.0, + "reward": 0.20476049184799194, + "reward_std": 0.15316171944141388, + "rewards/progression_diversity/mean": -0.008816054090857506, + "rewards/progression_diversity/std": 0.03022286854684353, + "rewards/symbolic_reward_accuracy/mean": 0.107421875, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.49988603591918945, + "rewards/symbolic_reward_partial_score/std": 0.2798597812652588, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.017579197883606, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 616.0, + "sampling/sampling_logp_difference/mean": 14.574539184570312, + "step": 501 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.26197493076324463, + "epoch": 0.8044871794871795, + "grad_norm": 209.1951141357422, + "learning_rate": 1e-06, + "loss": 0.1276, + "step": 502 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.26250365376472473, + "epoch": 0.8060897435897436, + "grad_norm": 0.026032600551843643, + "learning_rate": 1e-06, + "loss": 0.1871, + "step": 503 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.28040294349193573, + "epoch": 0.8076923076923077, + "grad_norm": 0.0345035195350647, + "learning_rate": 1e-06, + "loss": 0.0733, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6329.0, + "completions/mean_length": 4636.939453125, + "completions/mean_terminated_length": 3194.318115234375, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.2512836679816246, + "epoch": 0.8092948717948718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1516.8017578125, + "learning_rate": 1e-06, + "loss": 0.2065, + "num_tokens": 498112726.0, + "reward": 0.14471670985221863, + "reward_std": 0.1163174957036972, + "rewards/progression_diversity/mean": -0.007822653278708458, + "rewards/progression_diversity/std": 0.027946053072810173, + "rewards/symbolic_reward_accuracy/mean": 0.01953125, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.4754882752895355, + "rewards/symbolic_reward_partial_score/std": 0.25455576181411743, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0157849788665771, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 628.0, + "sampling/sampling_logp_difference/mean": 15.629146575927734, + "step": 505 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2723834812641144, + "epoch": 0.8108974358974359, + "grad_norm": 0.023276617750525475, + "learning_rate": 1e-06, + "loss": 0.0898, + "step": 506 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.24540285021066666, + "epoch": 0.8125, + "grad_norm": 2.523991584777832, + "learning_rate": 1e-06, + "loss": 0.1915, + "step": 507 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.27531081438064575, + "epoch": 0.8141025641025641, + "grad_norm": 0.03945612907409668, + "learning_rate": 1e-06, + "loss": 0.0928, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7280.0, + "completions/mean_length": 4566.736328125, + "completions/mean_terminated_length": 3230.87158203125, + "completions/min_length": 496.0, + "completions/min_terminated_length": 496.0, + "entropy": 0.25356655567884445, + "epoch": 0.8157051282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 2822.289306640625, + "learning_rate": 1e-06, + "loss": 0.2041, + "num_tokens": 501219247.0, + "reward": 0.21896670758724213, + "reward_std": 0.16303659975528717, + "rewards/progression_diversity/mean": -0.00762702152132988, + "rewards/progression_diversity/std": 0.029197612777352333, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.5237630605697632, + "rewards/symbolic_reward_partial_score/std": 0.2946349084377289, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.023378610610962, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 636.0, + "sampling/sampling_logp_difference/mean": 13.583820343017578, + "step": 509 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.28210754692554474, + "epoch": 0.8173076923076923, + "grad_norm": 0.023770952597260475, + "learning_rate": 1e-06, + "loss": 0.0548, + "step": 510 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.26374469697475433, + "epoch": 0.8189102564102564, + "grad_norm": 0.039423782378435135, + "learning_rate": 1e-06, + "loss": 0.1377, + "step": 511 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2674047648906708, + "epoch": 0.8205128205128205, + "grad_norm": 0.0316784493625164, + "learning_rate": 1e-06, + "loss": 0.1403, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6836.0, + "completions/mean_length": 5036.466796875, + "completions/mean_terminated_length": 3327.9619140625, + "completions/min_length": 756.0, + "completions/min_terminated_length": 756.0, + "entropy": 0.24934273958206177, + "epoch": 0.8221153846153846, + "frac_reward_zero_std": 0.0, + "grad_norm": 2330.71728515625, + "learning_rate": 1e-06, + "loss": 0.1066, + "num_tokens": 504675182.0, + "reward": 0.14850318431854248, + "reward_std": 0.10997183620929718, + "rewards/progression_diversity/mean": -0.01491626538336277, + "rewards/progression_diversity/std": 0.04920973256230354, + "rewards/symbolic_reward_accuracy/mean": 0.02734375, + "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, + "rewards/symbolic_reward_partial_score/mean": 0.4779297113418579, + "rewards/symbolic_reward_partial_score/std": 0.262747585773468, + "rewards/tag_count_reward/mean": -0.111328125, + "rewards/tag_count_reward/std": 0.31484565138816833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.004473090171814, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 640.0, + "sampling/sampling_logp_difference/mean": 19.370677947998047, + "step": 513 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.22383376955986023, + "epoch": 0.8237179487179487, + "grad_norm": 0.028715649619698524, + "learning_rate": 1e-06, + "loss": 0.2759, + "step": 514 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.23630475997924805, + "epoch": 0.8253205128205128, + "grad_norm": 0.03143737465143204, + "learning_rate": 1e-06, + "loss": 0.2012, + "step": 515 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.25393127650022507, + "epoch": 0.8269230769230769, + "grad_norm": 0.031186457723379135, + "learning_rate": 1e-06, + "loss": 0.0771, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6311.0, + "completions/mean_length": 4890.7109375, + "completions/mean_terminated_length": 3248.812744140625, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 0.25201961398124695, + "epoch": 0.8285256410256411, + "frac_reward_zero_std": 0.0, + "grad_norm": 291.3205261230469, + "learning_rate": 1e-06, + "loss": 0.1009, + "num_tokens": 508036634.0, + "reward": 0.1587275266647339, + "reward_std": 0.1489463448524475, + "rewards/progression_diversity/mean": -0.014943530783057213, + "rewards/progression_diversity/std": 0.049044571816921234, + "rewards/symbolic_reward_accuracy/mean": 0.04296875, + "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, + "rewards/symbolic_reward_partial_score/mean": 0.47750651836395264, + "rewards/symbolic_reward_partial_score/std": 0.25351831316947937, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0078704357147217, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 640.0, + "sampling/sampling_logp_difference/mean": 17.498640060424805, + "step": 517 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.249831885099411, + "epoch": 0.8301282051282052, + "grad_norm": 133.84860229492188, + "learning_rate": 1e-06, + "loss": 0.1318, + "step": 518 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2366134375333786, + "epoch": 0.8317307692307693, + "grad_norm": 0.042024604976177216, + "learning_rate": 1e-06, + "loss": 0.1665, + "step": 519 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.23418694734573364, + "epoch": 0.8333333333333334, + "grad_norm": 0.023398801684379578, + "learning_rate": 1e-06, + "loss": 0.1972, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.12109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8086.0, + "completions/mean_length": 4970.57421875, + "completions/mean_terminated_length": 3398.057861328125, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.24243058264255524, + "epoch": 0.8349358974358975, + "frac_reward_zero_std": 0.0, + "grad_norm": 2872.773193359375, + "learning_rate": 1e-06, + "loss": 0.1181, + "num_tokens": 511476240.0, + "reward": 0.14858070015907288, + "reward_std": 0.11485429108142853, + "rewards/progression_diversity/mean": -0.017418432980775833, + "rewards/progression_diversity/std": 0.056952327489852905, + "rewards/symbolic_reward_accuracy/mean": 0.033203125, + "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, + "rewards/symbolic_reward_partial_score/mean": 0.4639485478401184, + "rewards/symbolic_reward_partial_score/std": 0.2507692277431488, + "rewards/tag_count_reward/mean": -0.103515625, + "rewards/tag_count_reward/std": 0.30492907762527466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.008687973022461, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 648.0, + "sampling/sampling_logp_difference/mean": 16.52926254272461, + "step": 521 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.23126275092363358, + "epoch": 0.8365384615384616, + "grad_norm": 10394.248046875, + "learning_rate": 1e-06, + "loss": 1.4171, + "step": 522 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2367812767624855, + "epoch": 0.8381410256410257, + "grad_norm": 2017.530517578125, + "learning_rate": 1e-06, + "loss": 0.5716, + "step": 523 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.21209017932415009, + "epoch": 0.8397435897435898, + "grad_norm": 100393.40625, + "learning_rate": 1e-06, + "loss": 8.2323, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8267.0, + "completions/mean_length": 4735.314453125, + "completions/mean_terminated_length": 3418.50634765625, + "completions/min_length": 669.0, + "completions/min_terminated_length": 669.0, + "entropy": 0.24131416529417038, + "epoch": 0.8413461538461539, + "frac_reward_zero_std": 0.0, + "grad_norm": 1308.060302734375, + "learning_rate": 1e-06, + "loss": 0.0936, + "num_tokens": 514695857.0, + "reward": 0.18877416849136353, + "reward_std": 0.15265323221683502, + "rewards/progression_diversity/mean": -0.018579598516225815, + "rewards/progression_diversity/std": 0.0655272826552391, + "rewards/symbolic_reward_accuracy/mean": 0.078125, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.5009602904319763, + "rewards/symbolic_reward_partial_score/std": 0.28437086939811707, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0153002738952637, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 656.0, + "sampling/sampling_logp_difference/mean": 14.62424087524414, + "step": 525 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2314378321170807, + "epoch": 0.842948717948718, + "grad_norm": 2592.095703125, + "learning_rate": 1e-06, + "loss": 0.7228, + "step": 526 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24521462619304657, + "epoch": 0.844551282051282, + "grad_norm": 108445.40625, + "learning_rate": 1e-06, + "loss": 21.7594, + "step": 527 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.23320768773555756, + "epoch": 0.8461538461538461, + "grad_norm": 3017.028076171875, + "learning_rate": 1e-06, + "loss": 0.684, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6846.0, + "completions/mean_length": 5188.595703125, + "completions/mean_terminated_length": 3502.99560546875, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.23708263784646988, + "epoch": 0.8477564102564102, + "frac_reward_zero_std": 0.0, + "grad_norm": 436.8207092285156, + "learning_rate": 1e-06, + "loss": 0.03, + "num_tokens": 518277074.0, + "reward": 0.16587716341018677, + "reward_std": 0.13942670822143555, + "rewards/progression_diversity/mean": -0.018729275092482567, + "rewards/progression_diversity/std": 0.06120576709508896, + "rewards/symbolic_reward_accuracy/mean": 0.05078125, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.4884440302848816, + "rewards/symbolic_reward_partial_score/std": 0.27574726939201355, + "rewards/tag_count_reward/mean": -0.109375, + "rewards/tag_count_reward/std": 0.31241437792778015, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0088396072387695, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 668.0, + "sampling/sampling_logp_difference/mean": 15.507166862487793, + "step": 529 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2199697643518448, + "epoch": 0.8493589743589743, + "grad_norm": 0.035212092101573944, + "learning_rate": 1e-06, + "loss": 0.1937, + "step": 530 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.22101637721061707, + "epoch": 0.8509615384615384, + "grad_norm": 0.045379314571619034, + "learning_rate": 1e-06, + "loss": 0.1429, + "step": 531 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.18882086127996445, + "epoch": 0.8525641025641025, + "grad_norm": 0.029790926724672318, + "learning_rate": 1e-06, + "loss": 0.2997, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7388.0, + "completions/mean_length": 4781.244140625, + "completions/mean_terminated_length": 3327.712158203125, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 0.24219012260437012, + "epoch": 0.8541666666666666, + "frac_reward_zero_std": 0.0, + "grad_norm": 1252.3094482421875, + "learning_rate": 1e-06, + "loss": 0.0937, + "num_tokens": 521490175.0, + "reward": 0.19383737444877625, + "reward_std": 0.18049459159374237, + "rewards/progression_diversity/mean": -0.01616620644927025, + "rewards/progression_diversity/std": 0.057862453162670135, + "rewards/symbolic_reward_accuracy/mean": 0.0859375, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.5060384273529053, + "rewards/symbolic_reward_partial_score/std": 0.28731000423431396, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0143693685531616, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 680.0, + "sampling/sampling_logp_difference/mean": 14.317965507507324, + "step": 533 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.22050420194864273, + "epoch": 0.8557692307692307, + "grad_norm": 0.0265819001942873, + "learning_rate": 1e-06, + "loss": 0.17, + "step": 534 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.23473266512155533, + "epoch": 0.8573717948717948, + "grad_norm": 0.02770194783806801, + "learning_rate": 1e-06, + "loss": 0.1197, + "step": 535 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23300082236528397, + "epoch": 0.8589743589743589, + "grad_norm": 0.03747548907995224, + "learning_rate": 1e-06, + "loss": 0.1186, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14631.0, + "completions/mean_length": 5389.201171875, + "completions/mean_terminated_length": 3733.804443359375, + "completions/min_length": 973.0, + "completions/min_terminated_length": 973.0, + "entropy": 0.20381084084510803, + "epoch": 0.8605769230769231, + "frac_reward_zero_std": 0.0, + "grad_norm": 805.9613037109375, + "learning_rate": 1e-06, + "loss": 0.1618, + "num_tokens": 525175254.0, + "reward": 0.14221106469631195, + "reward_std": 0.12302105128765106, + "rewards/progression_diversity/mean": -0.0181512963026762, + "rewards/progression_diversity/std": 0.06077505275607109, + "rewards/symbolic_reward_accuracy/mean": 0.029296875, + "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, + "rewards/symbolic_reward_partial_score/mean": 0.44860023260116577, + "rewards/symbolic_reward_partial_score/std": 0.24405843019485474, + "rewards/tag_count_reward/mean": -0.09765625, + "rewards/tag_count_reward/std": 0.29713961482048035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.005043387413025, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 680.0, + "sampling/sampling_logp_difference/mean": 17.193443298339844, + "step": 537 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.20633187144994736, + "epoch": 0.8621794871794872, + "grad_norm": 0.02656022645533085, + "learning_rate": 1e-06, + "loss": 0.13, + "step": 538 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2009679228067398, + "epoch": 0.8637820512820513, + "grad_norm": 0.0328981950879097, + "learning_rate": 1e-06, + "loss": 0.2296, + "step": 539 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.22686323523521423, + "epoch": 0.8653846153846154, + "grad_norm": 0.034096311777830124, + "learning_rate": 1e-06, + "loss": 0.1219, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8314.0, + "completions/mean_length": 5590.28515625, + "completions/mean_terminated_length": 3650.40087890625, + "completions/min_length": 556.0, + "completions/min_terminated_length": 556.0, + "entropy": 0.18773535639047623, + "epoch": 0.8669871794871795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1792.0411376953125, + "learning_rate": 1e-06, + "loss": 0.3456, + "num_tokens": 528906344.0, + "reward": 0.12807568907737732, + "reward_std": 0.12272882461547852, + "rewards/progression_diversity/mean": -0.020069371908903122, + "rewards/progression_diversity/std": 0.0620153546333313, + "rewards/symbolic_reward_accuracy/mean": 0.017578125, + "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, + "rewards/symbolic_reward_partial_score/mean": 0.4406087398529053, + "rewards/symbolic_reward_partial_score/std": 0.2600206136703491, + "rewards/tag_count_reward/mean": -0.14453125, + "rewards/tag_count_reward/std": 0.35197147727012634, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.999688982963562, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 684.0, + "sampling/sampling_logp_difference/mean": 18.892946243286133, + "step": 541 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2230483591556549, + "epoch": 0.8685897435897436, + "grad_norm": 0.030025260522961617, + "learning_rate": 1e-06, + "loss": 0.0836, + "step": 542 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.21129625290632248, + "epoch": 0.8701923076923077, + "grad_norm": 0.029054047539830208, + "learning_rate": 1e-06, + "loss": 0.1741, + "step": 543 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.21284447610378265, + "epoch": 0.8717948717948718, + "grad_norm": 0.03237487003207207, + "learning_rate": 1e-06, + "loss": 0.1292, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.146484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7899.0, + "completions/mean_length": 5516.55078125, + "completions/mean_terminated_length": 3651.427734375, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "entropy": 0.22282154858112335, + "epoch": 0.8733974358974359, + "frac_reward_zero_std": 0.0, + "grad_norm": 214.0397186279297, + "learning_rate": 1e-06, + "loss": 0.0983, + "num_tokens": 532566914.0, + "reward": 0.13807618618011475, + "reward_std": 0.11777358502149582, + "rewards/progression_diversity/mean": -0.013183235190808773, + "rewards/progression_diversity/std": 0.04880344495177269, + "rewards/symbolic_reward_accuracy/mean": 0.01953125, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.4639485776424408, + "rewards/symbolic_reward_partial_score/std": 0.25486207008361816, + "rewards/tag_count_reward/mean": -0.126953125, + "rewards/tag_count_reward/std": 0.33324605226516724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0068590641021729, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 684.0, + "sampling/sampling_logp_difference/mean": 16.14547348022461, + "step": 545 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.20825005322694778, + "epoch": 0.875, + "grad_norm": 0.026716945692896843, + "learning_rate": 1e-06, + "loss": 0.1561, + "step": 546 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.19253433495759964, + "epoch": 0.8766025641025641, + "grad_norm": 0.5219494104385376, + "learning_rate": 1e-06, + "loss": 0.2689, + "step": 547 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.21415498107671738, + "epoch": 0.8782051282051282, + "grad_norm": 0.0328250527381897, + "learning_rate": 1e-06, + "loss": 0.1751, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.12109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7527.0, + "completions/mean_length": 5125.4140625, + "completions/mean_terminated_length": 3574.231201171875, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "entropy": 0.20042593777179718, + "epoch": 0.8798076923076923, + "frac_reward_zero_std": 0.0, + "grad_norm": 668.1113891601562, + "learning_rate": 1e-06, + "loss": 0.172, + "num_tokens": 536149046.0, + "reward": 0.15214072167873383, + "reward_std": 0.13333392143249512, + "rewards/progression_diversity/mean": -0.01493225246667862, + "rewards/progression_diversity/std": 0.061123333871364594, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.4685709476470947, + "rewards/symbolic_reward_partial_score/std": 0.26284727454185486, + "rewards/tag_count_reward/mean": -0.1171875, + "rewards/tag_count_reward/std": 0.32195815443992615, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.012796401977539, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 684.0, + "sampling/sampling_logp_difference/mean": 12.92884349822998, + "step": 549 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.21232055872678757, + "epoch": 0.8814102564102564, + "grad_norm": 0.02926452085375786, + "learning_rate": 1e-06, + "loss": 0.1224, + "step": 550 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.20979397743940353, + "epoch": 0.8830128205128205, + "grad_norm": 0.02844247967004776, + "learning_rate": 1e-06, + "loss": 0.1311, + "step": 551 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.1974465250968933, + "epoch": 0.8846153846153846, + "grad_norm": 0.9733312129974365, + "learning_rate": 1e-06, + "loss": 0.2059, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13505.0, + "completions/mean_length": 5052.701171875, + "completions/mean_terminated_length": 3688.978271484375, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.2104191780090332, + "epoch": 0.8862179487179487, + "frac_reward_zero_std": 0.0, + "grad_norm": 485.2064514160156, + "learning_rate": 1e-06, + "loss": 0.1146, + "num_tokens": 539713629.0, + "reward": 0.15813249349594116, + "reward_std": 0.12568315863609314, + "rewards/progression_diversity/mean": -0.006576837040483952, + "rewards/progression_diversity/std": 0.032705530524253845, + "rewards/symbolic_reward_accuracy/mean": 0.03515625, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.4889160096645355, + "rewards/symbolic_reward_partial_score/std": 0.24816220998764038, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0161933898925781, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 684.0, + "sampling/sampling_logp_difference/mean": 11.277822494506836, + "step": 553 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.20011521875858307, + "epoch": 0.8878205128205128, + "grad_norm": 170.6609344482422, + "learning_rate": 1e-06, + "loss": 0.1556, + "step": 554 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.19730040431022644, + "epoch": 0.8894230769230769, + "grad_norm": 0.03489186614751816, + "learning_rate": 1e-06, + "loss": 0.1825, + "step": 555 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2150089591741562, + "epoch": 0.8910256410256411, + "grad_norm": 2.223613977432251, + "learning_rate": 1e-06, + "loss": 0.1191, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.146484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7381.0, + "completions/mean_length": 5520.578125, + "completions/mean_terminated_length": 3656.146240234375, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.21337107568979263, + "epoch": 0.8926282051282052, + "frac_reward_zero_std": 0.0, + "grad_norm": 2374.782470703125, + "learning_rate": 1e-06, + "loss": 0.0962, + "num_tokens": 543413141.0, + "reward": 0.1618734747171402, + "reward_std": 0.1597273349761963, + "rewards/progression_diversity/mean": -0.014801505953073502, + "rewards/progression_diversity/std": 0.047020554542541504, + "rewards/symbolic_reward_accuracy/mean": 0.06640625, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.44892579317092896, + "rewards/symbolic_reward_partial_score/std": 0.27913251519203186, + "rewards/tag_count_reward/mean": -0.125, + "rewards/tag_count_reward/std": 0.3310423493385315, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986110925674438, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 688.0, + "sampling/sampling_logp_difference/mean": 18.847614288330078, + "step": 557 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.18713711947202682, + "epoch": 0.8942307692307693, + "grad_norm": 461.1898193359375, + "learning_rate": 1e-06, + "loss": 0.229, + "step": 558 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.19299649447202682, + "epoch": 0.8958333333333334, + "grad_norm": 0.025606518611311913, + "learning_rate": 1e-06, + "loss": 0.2068, + "step": 559 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.1916685476899147, + "epoch": 0.8974358974358975, + "grad_norm": 0.03450322896242142, + "learning_rate": 1e-06, + "loss": 0.1615, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8600.0, + "completions/mean_length": 5390.927734375, + "completions/mean_terminated_length": 3735.791015625, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "entropy": 0.20245271921157837, + "epoch": 0.8990384615384616, + "frac_reward_zero_std": 0.0, + "grad_norm": 967.3348999023438, + "learning_rate": 1e-06, + "loss": 0.1123, + "num_tokens": 547056576.0, + "reward": 0.13580447435379028, + "reward_std": 0.12063577026128769, + "rewards/progression_diversity/mean": -0.0128158088773489, + "rewards/progression_diversity/std": 0.04528747498989105, + "rewards/symbolic_reward_accuracy/mean": 0.015625, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.4648274779319763, + "rewards/symbolic_reward_partial_score/std": 0.2545829117298126, + "rewards/tag_count_reward/mean": -0.12890625, + "rewards/tag_count_reward/std": 0.33542385697364807, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0074546337127686, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 696.0, + "sampling/sampling_logp_difference/mean": 15.149176597595215, + "step": 561 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.1998009830713272, + "epoch": 0.9006410256410257, + "grad_norm": 0.021788692101836205, + "learning_rate": 1e-06, + "loss": 0.1464, + "step": 562 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.1879931315779686, + "epoch": 0.9022435897435898, + "grad_norm": 0.03801717236638069, + "learning_rate": 1e-06, + "loss": 0.2225, + "step": 563 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.19708418101072311, + "epoch": 0.9038461538461539, + "grad_norm": 0.028179530054330826, + "learning_rate": 1e-06, + "loss": 0.199, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.123046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7721.0, + "completions/mean_length": 5477.939453125, + "completions/mean_terminated_length": 3947.6904296875, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "entropy": 0.1928836703300476, + "epoch": 0.905448717948718, + "frac_reward_zero_std": 0.0, + "grad_norm": 1236.47900390625, + "learning_rate": 1e-06, + "loss": 0.1333, + "num_tokens": 550782689.0, + "reward": 0.1491735279560089, + "reward_std": 0.13501743972301483, + "rewards/progression_diversity/mean": -0.01135750487446785, + "rewards/progression_diversity/std": 0.042249299585819244, + "rewards/symbolic_reward_accuracy/mean": 0.04296875, + "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, + "rewards/symbolic_reward_partial_score/mean": 0.4500976502895355, + "rewards/symbolic_reward_partial_score/std": 0.2508525252342224, + "rewards/tag_count_reward/mean": -0.115234375, + "rewards/tag_count_reward/std": 0.3196168541908264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.008751392364502, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 696.0, + "sampling/sampling_logp_difference/mean": 14.606096267700195, + "step": 565 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.19845175743103027, + "epoch": 0.907051282051282, + "grad_norm": 32.34842300415039, + "learning_rate": 1e-06, + "loss": 0.1143, + "step": 566 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.185062974691391, + "epoch": 0.9086538461538461, + "grad_norm": 26.105823516845703, + "learning_rate": 1e-06, + "loss": 0.2236, + "step": 567 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.190422885119915, + "epoch": 0.9102564102564102, + "grad_norm": 0.02777501754462719, + "learning_rate": 1e-06, + "loss": 0.1594, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.115234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8955.0, + "completions/mean_length": 5115.3125, + "completions/mean_terminated_length": 3647.646728515625, + "completions/min_length": 683.0, + "completions/min_terminated_length": 683.0, + "entropy": 0.18562601506710052, + "epoch": 0.9118589743589743, + "frac_reward_zero_std": 0.0, + "grad_norm": 2351.0830078125, + "learning_rate": 1e-06, + "loss": 0.1649, + "num_tokens": 554328513.0, + "reward": 0.1948603093624115, + "reward_std": 0.1654394268989563, + "rewards/progression_diversity/mean": -0.011040431447327137, + "rewards/progression_diversity/std": 0.044010140001773834, + "rewards/symbolic_reward_accuracy/mean": 0.091796875, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.49755859375, + "rewards/symbolic_reward_partial_score/std": 0.2728171944618225, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0104902982711792, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 700.0, + "sampling/sampling_logp_difference/mean": 13.494155883789062, + "step": 569 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.20058371126651764, + "epoch": 0.9134615384615384, + "grad_norm": 0.0499381348490715, + "learning_rate": 1e-06, + "loss": 0.1804, + "step": 570 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.19234994798898697, + "epoch": 0.9150641025641025, + "grad_norm": 0.04124247282743454, + "learning_rate": 1e-06, + "loss": 0.1146, + "step": 571 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.20797397196292877, + "epoch": 0.9166666666666666, + "grad_norm": 0.028247077018022537, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10550.0, + "completions/mean_length": 4636.544921875, + "completions/mean_terminated_length": 3694.765625, + "completions/min_length": 733.0, + "completions/min_terminated_length": 733.0, + "entropy": 0.21711117029190063, + "epoch": 0.9182692307692307, + "frac_reward_zero_std": 0.0, + "grad_norm": 234.86900329589844, + "learning_rate": 1e-06, + "loss": 0.0689, + "num_tokens": 557529192.0, + "reward": 0.20868007838726044, + "reward_std": 0.16052421927452087, + "rewards/progression_diversity/mean": -0.0065048919059336185, + "rewards/progression_diversity/std": 0.03668885678052902, + "rewards/symbolic_reward_accuracy/mean": 0.1015625, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.5161295533180237, + "rewards/symbolic_reward_partial_score/std": 0.28020334243774414, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0254935026168823, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 700.0, + "sampling/sampling_logp_difference/mean": 8.34628963470459, + "step": 573 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.20679805427789688, + "epoch": 0.9198717948717948, + "grad_norm": 0.06306414306163788, + "learning_rate": 1e-06, + "loss": 0.1743, + "step": 574 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.21248594671487808, + "epoch": 0.9214743589743589, + "grad_norm": 0.033059295266866684, + "learning_rate": 1e-06, + "loss": 0.1313, + "step": 575 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.21614834666252136, + "epoch": 0.9230769230769231, + "grad_norm": 0.023236755281686783, + "learning_rate": 1e-06, + "loss": 0.0916, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7749.0, + "completions/mean_length": 5531.244140625, + "completions/mean_terminated_length": 3897.233642578125, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "entropy": 0.1956375539302826, + "epoch": 0.9246794871794872, + "frac_reward_zero_std": 0.0, + "grad_norm": 377.7513122558594, + "learning_rate": 1e-06, + "loss": 0.0834, + "num_tokens": 561189509.0, + "reward": 0.15623126924037933, + "reward_std": 0.13696351647377014, + "rewards/progression_diversity/mean": -0.006757441908121109, + "rewards/progression_diversity/std": 0.02823774889111519, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.4793294072151184, + "rewards/symbolic_reward_partial_score/std": 0.26099953055381775, + "rewards/tag_count_reward/mean": -0.109375, + "rewards/tag_count_reward/std": 0.31241437792778015, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0143134593963623, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 704.0, + "sampling/sampling_logp_difference/mean": 11.73034381866455, + "step": 577 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.19732258468866348, + "epoch": 0.9262820512820513, + "grad_norm": 0.030800944194197655, + "learning_rate": 1e-06, + "loss": 0.1879, + "step": 578 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.19447887688875198, + "epoch": 0.9278846153846154, + "grad_norm": 0.03231598064303398, + "learning_rate": 1e-06, + "loss": 0.1604, + "step": 579 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.19631733745336533, + "epoch": 0.9294871794871795, + "grad_norm": 0.0303302314132452, + "learning_rate": 1e-06, + "loss": 0.176, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.119140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10004.0, + "completions/mean_length": 5372.26953125, + "completions/mean_terminated_length": 3882.8779296875, + "completions/min_length": 494.0, + "completions/min_terminated_length": 494.0, + "entropy": 0.19707387685775757, + "epoch": 0.9310897435897436, + "frac_reward_zero_std": 0.0, + "grad_norm": 645.3209838867188, + "learning_rate": 1e-06, + "loss": 0.1367, + "num_tokens": 564833183.0, + "reward": 0.14636225998401642, + "reward_std": 0.13317109644412994, + "rewards/progression_diversity/mean": -0.0073294322937726974, + "rewards/progression_diversity/std": 0.031211169436573982, + "rewards/symbolic_reward_accuracy/mean": 0.03515625, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.4542643129825592, + "rewards/symbolic_reward_partial_score/std": 0.2500608563423157, + "rewards/tag_count_reward/mean": -0.109375, + "rewards/tag_count_reward/std": 0.31241437792778015, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0127942562103271, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 704.0, + "sampling/sampling_logp_difference/mean": 12.442346572875977, + "step": 581 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.1957729086279869, + "epoch": 0.9326923076923077, + "grad_norm": 0.03353438526391983, + "learning_rate": 1e-06, + "loss": 0.121, + "step": 582 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.19240333884954453, + "epoch": 0.9342948717948718, + "grad_norm": 6.355886936187744, + "learning_rate": 1e-06, + "loss": 0.1826, + "step": 583 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.1818048357963562, + "epoch": 0.9358974358974359, + "grad_norm": 0.03699595853686333, + "learning_rate": 1e-06, + "loss": 0.1854, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11314.0, + "completions/mean_length": 5186.060546875, + "completions/mean_terminated_length": 3755.486572265625, + "completions/min_length": 563.0, + "completions/min_terminated_length": 563.0, + "entropy": 0.1932847574353218, + "epoch": 0.9375, + "frac_reward_zero_std": 0.0, + "grad_norm": 567.5565185546875, + "learning_rate": 1e-06, + "loss": 0.1316, + "num_tokens": 568426046.0, + "reward": 0.18020448088645935, + "reward_std": 0.1428624838590622, + "rewards/progression_diversity/mean": -0.00933791883289814, + "rewards/progression_diversity/std": 0.04019331932067871, + "rewards/symbolic_reward_accuracy/mean": 0.07421875, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.4857584536075592, + "rewards/symbolic_reward_partial_score/std": 0.26446202397346497, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.010314702987671, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 704.0, + "sampling/sampling_logp_difference/mean": 13.445650100708008, + "step": 585 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.193020299077034, + "epoch": 0.9391025641025641, + "grad_norm": 0.04160737618803978, + "learning_rate": 1e-06, + "loss": 0.1617, + "step": 586 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.19402997195720673, + "epoch": 0.9407051282051282, + "grad_norm": 0.02597195655107498, + "learning_rate": 1e-06, + "loss": 0.1175, + "step": 587 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.1869238093495369, + "epoch": 0.9423076923076923, + "grad_norm": 0.028201278299093246, + "learning_rate": 1e-06, + "loss": 0.1579, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8968.0, + "completions/mean_length": 5071.556640625, + "completions/mean_terminated_length": 3765.32666015625, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "entropy": 0.20189929753541946, + "epoch": 0.9439102564102564, + "frac_reward_zero_std": 0.0, + "grad_norm": 732.089111328125, + "learning_rate": 1e-06, + "loss": 0.1303, + "num_tokens": 571972779.0, + "reward": 0.1497948169708252, + "reward_std": 0.12462593615055084, + "rewards/progression_diversity/mean": -0.00635932432487607, + "rewards/progression_diversity/std": 0.03013550490140915, + "rewards/symbolic_reward_accuracy/mean": 0.03515625, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.4578613340854645, + "rewards/symbolic_reward_partial_score/std": 0.25101757049560547, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.014004111289978, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 708.0, + "sampling/sampling_logp_difference/mean": 11.998255729675293, + "step": 589 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.19095805287361145, + "epoch": 0.9455128205128205, + "grad_norm": 124.67987060546875, + "learning_rate": 1e-06, + "loss": 0.1042, + "step": 590 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.19328931719064713, + "epoch": 0.9471153846153846, + "grad_norm": 0.03191890940070152, + "learning_rate": 1e-06, + "loss": 0.1363, + "step": 591 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.18839430809020996, + "epoch": 0.9487179487179487, + "grad_norm": 0.5878300666809082, + "learning_rate": 1e-06, + "loss": 0.1515, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8919.0, + "completions/mean_length": 5028.666015625, + "completions/mean_terminated_length": 3799.733642578125, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "entropy": 0.20535209774971008, + "epoch": 0.9503205128205128, + "frac_reward_zero_std": 0.0, + "grad_norm": 697.3111572265625, + "learning_rate": 1e-06, + "loss": 0.1668, + "num_tokens": 575335136.0, + "reward": 0.1823686808347702, + "reward_std": 0.14714612066745758, + "rewards/progression_diversity/mean": -0.006297005340456963, + "rewards/progression_diversity/std": 0.02788672409951687, + "rewards/symbolic_reward_accuracy/mean": 0.064453125, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.5111002922058105, + "rewards/symbolic_reward_partial_score/std": 0.2809692621231079, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.015028953552246, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 712.0, + "sampling/sampling_logp_difference/mean": 12.842995643615723, + "step": 593 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2099175900220871, + "epoch": 0.9519230769230769, + "grad_norm": 0.03140028938651085, + "learning_rate": 1e-06, + "loss": 0.0722, + "step": 594 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.19498909264802933, + "epoch": 0.9535256410256411, + "grad_norm": 0.029133038595318794, + "learning_rate": 1e-06, + "loss": 0.1713, + "step": 595 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.20219064503908157, + "epoch": 0.9551282051282052, + "grad_norm": 253.06321716308594, + "learning_rate": 1e-06, + "loss": 0.1111, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7837.0, + "completions/mean_length": 4973.22265625, + "completions/mean_terminated_length": 3873.68310546875, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "entropy": 0.21754413098096848, + "epoch": 0.9567307692307693, + "frac_reward_zero_std": 0.0, + "grad_norm": 279.1087646484375, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 578751282.0, + "reward": 0.14907152950763702, + "reward_std": 0.11073170602321625, + "rewards/progression_diversity/mean": -0.004956918768584728, + "rewards/progression_diversity/std": 0.026602579280734062, + "rewards/symbolic_reward_accuracy/mean": 0.029296875, + "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, + "rewards/symbolic_reward_partial_score/mean": 0.4677734375, + "rewards/symbolic_reward_partial_score/std": 0.25228434801101685, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.02189040184021, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 712.0, + "sampling/sampling_logp_difference/mean": 9.754007339477539, + "step": 597 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.20453065633773804, + "epoch": 0.9583333333333334, + "grad_norm": 189.28167724609375, + "learning_rate": 1e-06, + "loss": 0.2374, + "step": 598 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.20873209089040756, + "epoch": 0.9599358974358975, + "grad_norm": 47.94603729248047, + "learning_rate": 1e-06, + "loss": 0.0853, + "step": 599 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.19657935947179794, + "epoch": 0.9615384615384616, + "grad_norm": 0.04143374040722847, + "learning_rate": 1e-06, + "loss": 0.2157, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13524.0, + "completions/mean_length": 5178.005859375, + "completions/mean_terminated_length": 3774.17822265625, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.20645452290773392, + "epoch": 0.9631410256410257, + "frac_reward_zero_std": 0.0, + "grad_norm": 354.8719482421875, + "learning_rate": 1e-06, + "loss": 0.1344, + "num_tokens": 582303205.0, + "reward": 0.1530710607767105, + "reward_std": 0.11708801239728928, + "rewards/progression_diversity/mean": -0.0058823819272220135, + "rewards/progression_diversity/std": 0.034123148769140244, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.4681152403354645, + "rewards/symbolic_reward_partial_score/std": 0.2567155063152313, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.022865891456604, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 716.0, + "sampling/sampling_logp_difference/mean": 8.755578994750977, + "step": 601 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2079230472445488, + "epoch": 0.9647435897435898, + "grad_norm": 0.13820700347423553, + "learning_rate": 1e-06, + "loss": 0.154, + "step": 602 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2013714388012886, + "epoch": 0.9663461538461539, + "grad_norm": 0.05277189984917641, + "learning_rate": 1e-06, + "loss": 0.082, + "step": 603 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.20772968977689743, + "epoch": 0.967948717948718, + "grad_norm": 164.80355834960938, + "learning_rate": 1e-06, + "loss": 0.1195, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14972.0, + "completions/mean_length": 5488.85546875, + "completions/mean_terminated_length": 3932.406494140625, + "completions/min_length": 697.0, + "completions/min_terminated_length": 697.0, + "entropy": 0.18739153444766998, + "epoch": 0.969551282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 190.9339141845703, + "learning_rate": 1e-06, + "loss": 0.1891, + "num_tokens": 585997131.0, + "reward": 0.13834749162197113, + "reward_std": 0.10280528664588928, + "rewards/progression_diversity/mean": -0.004118788987398148, + "rewards/progression_diversity/std": 0.02404225990176201, + "rewards/symbolic_reward_accuracy/mean": 0.013671875, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.47366535663604736, + "rewards/symbolic_reward_partial_score/std": 0.26022228598594666, + "rewards/tag_count_reward/mean": -0.119140625, + "rewards/tag_count_reward/std": 0.32427072525024414, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.021909475326538, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 716.0, + "sampling/sampling_logp_difference/mean": 8.64652156829834, + "step": 605 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.19497393816709518, + "epoch": 0.9711538461538461, + "grad_norm": 1706.4573974609375, + "learning_rate": 1e-06, + "loss": 0.2708, + "step": 606 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.20221728086471558, + "epoch": 0.9727564102564102, + "grad_norm": 0.10873478651046753, + "learning_rate": 1e-06, + "loss": 0.1281, + "step": 607 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.20786841213703156, + "epoch": 0.9743589743589743, + "grad_norm": 0.033346377313137054, + "learning_rate": 1e-06, + "loss": 0.0572, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6816.0, + "completions/mean_length": 5060.9296875, + "completions/mean_terminated_length": 3916.447509765625, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 0.21419016271829605, + "epoch": 0.9759615384615384, + "frac_reward_zero_std": 0.0, + "grad_norm": 586.444580078125, + "learning_rate": 1e-06, + "loss": 0.0856, + "num_tokens": 589427335.0, + "reward": 0.16882535815238953, + "reward_std": 0.1255710870027542, + "rewards/progression_diversity/mean": -0.005649726837873459, + "rewards/progression_diversity/std": 0.029928982257843018, + "rewards/symbolic_reward_accuracy/mean": 0.044921875, + "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, + "rewards/symbolic_reward_partial_score/mean": 0.500439465045929, + "rewards/symbolic_reward_partial_score/std": 0.2657742500305176, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.023430585861206, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 9.106830596923828, + "step": 609 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.18879622966051102, + "epoch": 0.9775641025641025, + "grad_norm": 236.33624267578125, + "learning_rate": 1e-06, + "loss": 0.2667, + "step": 610 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2075936570763588, + "epoch": 0.9791666666666666, + "grad_norm": 2521.72265625, + "learning_rate": 1e-06, + "loss": 0.1519, + "step": 611 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2186291366815567, + "epoch": 0.9807692307692307, + "grad_norm": 0.11578943580389023, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7141.0, + "completions/mean_length": 5257.30859375, + "completions/mean_terminated_length": 4053.12109375, + "completions/min_length": 1060.0, + "completions/min_terminated_length": 1060.0, + "entropy": 0.1974191665649414, + "epoch": 0.9823717948717948, + "frac_reward_zero_std": 0.0, + "grad_norm": 265.5615539550781, + "learning_rate": 1e-06, + "loss": 0.22, + "num_tokens": 592999621.0, + "reward": 0.16088539361953735, + "reward_std": 0.10721283406019211, + "rewards/progression_diversity/mean": -0.0037470583338290453, + "rewards/progression_diversity/std": 0.02392842248082161, + "rewards/symbolic_reward_accuracy/mean": 0.046875, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.47390952706336975, + "rewards/symbolic_reward_partial_score/std": 0.23947234451770782, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0285604000091553, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 6.232402801513672, + "step": 613 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.21706271171569824, + "epoch": 0.9839743589743589, + "grad_norm": 286162.34375, + "learning_rate": 1e-06, + "loss": 93.3376, + "step": 614 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2068658247590065, + "epoch": 0.9855769230769231, + "grad_norm": 8.328322410583496, + "learning_rate": 1e-06, + "loss": 0.1488, + "step": 615 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.21251578629016876, + "epoch": 0.9871794871794872, + "grad_norm": 284.380615234375, + "learning_rate": 1e-06, + "loss": 0.1166, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9352.0, + "completions/mean_length": 5079.92578125, + "completions/mean_terminated_length": 4017.149658203125, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "entropy": 0.20627272129058838, + "epoch": 0.9887820512820513, + "frac_reward_zero_std": 0.0, + "grad_norm": 218.53688049316406, + "learning_rate": 1e-06, + "loss": 0.0771, + "num_tokens": 596455135.0, + "reward": 0.16188350319862366, + "reward_std": 0.10645517706871033, + "rewards/progression_diversity/mean": -0.0030562267638742924, + "rewards/progression_diversity/std": 0.020391173660755157, + "rewards/symbolic_reward_accuracy/mean": 0.041015625, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.484375, + "rewards/symbolic_reward_partial_score/std": 0.23614925146102905, + "rewards/tag_count_reward/mean": -0.080078125, + "rewards/tag_count_reward/std": 0.271679550409317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.029923439025879, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 5.815011024475098, + "step": 617 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.21643763035535812, + "epoch": 0.9903846153846154, + "grad_norm": 8340.802734375, + "learning_rate": 1e-06, + "loss": 0.641, + "step": 618 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.21413151174783707, + "epoch": 0.9919871794871795, + "grad_norm": 0.045055363327264786, + "learning_rate": 1e-06, + "loss": 0.0645, + "step": 619 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.20299791544675827, + "epoch": 0.9935897435897436, + "grad_norm": 2.9305126667022705, + "learning_rate": 1e-06, + "loss": 0.1387, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9045.0, + "completions/mean_length": 4689.990234375, + "completions/mean_terminated_length": 3910.389892578125, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "entropy": 0.2107323333621025, + "epoch": 0.9951923076923077, + "frac_reward_zero_std": 0.0, + "grad_norm": 822.1503295898438, + "learning_rate": 1e-06, + "loss": 0.1441, + "num_tokens": 599732858.0, + "reward": 0.1449926495552063, + "reward_std": 0.08904429525136948, + "rewards/progression_diversity/mean": -0.0031782728619873524, + "rewards/progression_diversity/std": 0.03115173615515232, + "rewards/symbolic_reward_accuracy/mean": 0.02734375, + "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, + "rewards/symbolic_reward_partial_score/mean": 0.4482584595680237, + "rewards/symbolic_reward_partial_score/std": 0.2407679557800293, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0356639623641968, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 3.8055062294006348, + "step": 621 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.21765735745429993, + "epoch": 0.9967948717948718, + "grad_norm": 0.11155123263597488, + "learning_rate": 1e-06, + "loss": 0.074, + "step": 622 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2163386046886444, + "epoch": 0.9983974358974359, + "grad_norm": 0.035584624856710434, + "learning_rate": 1e-06, + "loss": 0.0425, + "step": 623 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2188253104686737, + "epoch": 1.0, + "grad_norm": 0.041072484105825424, + "learning_rate": 1e-06, + "loss": 0.127, + "step": 624 + }, + { + "epoch": 1.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.05615234375, + "eval_completions/max_length": 16384.0, + "eval_completions/max_terminated_length": 6932.09375, + "eval_completions/mean_length": 4607.2421875, + "eval_completions/mean_terminated_length": 3907.518020629883, + "eval_completions/min_length": 1083.65625, + "eval_completions/min_terminated_length": 1083.65625, + "eval_entropy": 0.21419930551201105, + "eval_frac_reward_zero_std": 0.0078125, + "eval_loss": 0.040855128318071365, + "eval_num_tokens": 599732858.0, + "eval_reward": 0.15359255392104387, + "eval_reward_std": 0.0873756860382855, + "eval_rewards/progression_diversity/mean": -0.002012389669403092, + "eval_rewards/progression_diversity/std": 0.016667645196321246, + "eval_rewards/symbolic_reward_accuracy/mean": 0.02294921875, + "eval_rewards/symbolic_reward_accuracy/std": 0.11482575349509716, + "eval_rewards/symbolic_reward_partial_score/mean": 0.4836405459791422, + "eval_rewards/symbolic_reward_partial_score/std": 0.2179896729066968, + "eval_rewards/tag_count_reward/mean": -0.052490234375, + "eval_rewards/tag_count_reward/std": 0.21463490999303758, + "eval_runtime": 4410.5978, + "eval_samples_per_second": 0.057, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0419876985251904, + "eval_sampling/importance_sampling_ratio/min": 0.0, + "eval_sampling/sampling_logp_difference/max": 687.4686508178711, + "eval_sampling/sampling_logp_difference/mean": 2.2834634024184197, + "eval_steps_per_second": 0.0, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7068.0, + "completions/mean_length": 4668.310546875, + "completions/mean_terminated_length": 3861.175537109375, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "entropy": 0.2172800451517105, + "epoch": 1.001602564102564, + "frac_reward_zero_std": 0.0, + "grad_norm": 199.8529510498047, + "learning_rate": 1e-06, + "loss": 0.0884, + "num_tokens": 603060937.0, + "reward": 0.1484626829624176, + "reward_std": 0.08050423860549927, + "rewards/progression_diversity/mean": -0.001877149916253984, + "rewards/progression_diversity/std": 0.01999068818986416, + "rewards/symbolic_reward_accuracy/mean": 0.01953125, + "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, + "rewards/symbolic_reward_partial_score/mean": 0.4773600399494171, + "rewards/symbolic_reward_partial_score/std": 0.22182992100715637, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0379219055175781, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 2.98622727394104, + "step": 625 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.22022734582424164, + "epoch": 1.0032051282051282, + "grad_norm": 20.245939254760742, + "learning_rate": 1e-06, + "loss": 0.0854, + "step": 626 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22906509041786194, + "epoch": 1.0048076923076923, + "grad_norm": 0.04052429273724556, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 627 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.3359375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.20621829479932785, + "epoch": 1.0064102564102564, + "grad_norm": 0.6211011409759521, + "learning_rate": 1e-06, + "loss": 0.1451, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6794.0, + "completions/mean_length": 4257.8671875, + "completions/mean_terminated_length": 3764.934814453125, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "entropy": 0.2293422520160675, + "epoch": 1.0080128205128205, + "frac_reward_zero_std": 0.0, + "grad_norm": 39.52851104736328, + "learning_rate": 1e-06, + "loss": 0.0808, + "num_tokens": 606104485.0, + "reward": 0.172392338514328, + "reward_std": 0.10835404694080353, + "rewards/progression_diversity/mean": -0.001978690503165126, + "rewards/progression_diversity/std": 0.029427627101540565, + "rewards/symbolic_reward_accuracy/mean": 0.046875, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.4920247495174408, + "rewards/symbolic_reward_partial_score/std": 0.22508575022220612, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.044852375984192, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 1.5544822216033936, + "step": 629 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.23360887169837952, + "epoch": 1.0096153846153846, + "grad_norm": 0.04223166033625603, + "learning_rate": 1e-06, + "loss": 0.0465, + "step": 630 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2338651716709137, + "epoch": 1.0112179487179487, + "grad_norm": 5.134727478027344, + "learning_rate": 1e-06, + "loss": 0.0475, + "step": 631 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.22925083339214325, + "epoch": 1.0128205128205128, + "grad_norm": 0.20785175263881683, + "learning_rate": 1e-06, + "loss": 0.0922, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7371.0, + "completions/mean_length": 4059.546875, + "completions/mean_terminated_length": 3713.076171875, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.23891044408082962, + "epoch": 1.0144230769230769, + "frac_reward_zero_std": 0.0, + "grad_norm": 379.1642150878906, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 609025469.0, + "reward": 0.19592517614364624, + "reward_std": 0.10350589454174042, + "rewards/progression_diversity/mean": -0.0012319717789068818, + "rewards/progression_diversity/std": 0.013662992045283318, + "rewards/symbolic_reward_accuracy/mean": 0.078125, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.5053385496139526, + "rewards/symbolic_reward_partial_score/std": 0.2347535789012909, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0454130172729492, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 1.7672368288040161, + "step": 633 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.23257441073656082, + "epoch": 1.016025641025641, + "grad_norm": 0.036344923079013824, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 634 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.22513434290885925, + "epoch": 1.017628205128205, + "grad_norm": 0.2888374924659729, + "learning_rate": 1e-06, + "loss": 0.1695, + "step": 635 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.23959070444107056, + "epoch": 1.0192307692307692, + "grad_norm": 0.04031003266572952, + "learning_rate": 1e-06, + "loss": 0.0214, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6660.0, + "completions/mean_length": 4129.09375, + "completions/mean_terminated_length": 3784.578125, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "entropy": 0.22954071313142776, + "epoch": 1.0208333333333333, + "frac_reward_zero_std": 0.03125, + "grad_norm": 803.85888671875, + "learning_rate": 1e-06, + "loss": 0.0414, + "num_tokens": 611954541.0, + "reward": 0.17702984809875488, + "reward_std": 0.10909046232700348, + "rewards/progression_diversity/mean": -0.0016061984933912754, + "rewards/progression_diversity/std": 0.013535745441913605, + "rewards/symbolic_reward_accuracy/mean": 0.046875, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.5055176019668579, + "rewards/symbolic_reward_partial_score/std": 0.2366805523633957, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.040963053703308, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 4.412889003753662, + "step": 637 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23653806746006012, + "epoch": 1.0224358974358974, + "grad_norm": 0.032999370247125626, + "learning_rate": 1e-06, + "loss": 0.0386, + "step": 638 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.23699220269918442, + "epoch": 1.0240384615384615, + "grad_norm": 0.044161055237054825, + "learning_rate": 1e-06, + "loss": 0.0856, + "step": 639 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2416703775525093, + "epoch": 1.0256410256410255, + "grad_norm": 0.04160209372639656, + "learning_rate": 1e-06, + "loss": -0.0036, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14949.0, + "completions/mean_length": 3966.068359375, + "completions/mean_terminated_length": 3616.9697265625, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.2398141846060753, + "epoch": 1.0272435897435896, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.787557601928711, + "learning_rate": 1e-06, + "loss": 0.0664, + "num_tokens": 614793424.0, + "reward": 0.1760445237159729, + "reward_std": 0.09620662778615952, + "rewards/progression_diversity/mean": -0.001016853959299624, + "rewards/progression_diversity/std": 0.013255119323730469, + "rewards/symbolic_reward_accuracy/mean": 0.04296875, + "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, + "rewards/symbolic_reward_partial_score/mean": 0.5093749761581421, + "rewards/symbolic_reward_partial_score/std": 0.20591585338115692, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0430570840835571, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 3.0360569953918457, + "step": 641 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.23307596892118454, + "epoch": 1.0288461538461537, + "grad_norm": 0.9455596208572388, + "learning_rate": 1e-06, + "loss": 0.0624, + "step": 642 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.23648811876773834, + "epoch": 1.0304487179487178, + "grad_norm": 0.03208564594388008, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 643 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.23585931956768036, + "epoch": 1.032051282051282, + "grad_norm": 0.05858050286769867, + "learning_rate": 1e-06, + "loss": 0.087, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6913.0, + "completions/mean_length": 3554.791015625, + "completions/mean_terminated_length": 3194.13037109375, + "completions/min_length": 679.0, + "completions/min_terminated_length": 679.0, + "entropy": 0.23475763201713562, + "epoch": 1.0336538461538463, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5429258346557617, + "learning_rate": 1e-06, + "loss": 0.0621, + "num_tokens": 617548181.0, + "reward": 0.1708458811044693, + "reward_std": 0.0782819613814354, + "rewards/progression_diversity/mean": -0.0018388191238045692, + "rewards/progression_diversity/std": 0.020668642595410347, + "rewards/symbolic_reward_accuracy/mean": 0.02734375, + "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, + "rewards/symbolic_reward_partial_score/mean": 0.5226725339889526, + "rewards/symbolic_reward_partial_score/std": 0.20760734379291534, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.043994665145874, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 2.3344595432281494, + "step": 645 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.231798954308033, + "epoch": 1.0352564102564104, + "grad_norm": 0.04138017073273659, + "learning_rate": 1e-06, + "loss": 0.0803, + "step": 646 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.24344021826982498, + "epoch": 1.0368589743589745, + "grad_norm": 0.03530874475836754, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 647 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2350650280714035, + "epoch": 1.0384615384615385, + "grad_norm": 0.035837192088365555, + "learning_rate": 1e-06, + "loss": 0.0492, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5463.0, + "completions/mean_length": 3913.376953125, + "completions/mean_terminated_length": 3380.01025390625, + "completions/min_length": 648.0, + "completions/min_terminated_length": 648.0, + "entropy": 0.21108438819646835, + "epoch": 1.0400641025641026, + "frac_reward_zero_std": 0.03125, + "grad_norm": 306.6093444824219, + "learning_rate": 1e-06, + "loss": 0.151, + "num_tokens": 620433494.0, + "reward": 0.18476390838623047, + "reward_std": 0.11055901646614075, + "rewards/progression_diversity/mean": -0.001637741457670927, + "rewards/progression_diversity/std": 0.015057054348289967, + "rewards/symbolic_reward_accuracy/mean": 0.064453125, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.5006998777389526, + "rewards/symbolic_reward_partial_score/std": 0.23833897709846497, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.038619875907898, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 3.4865543842315674, + "step": 649 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23450932651758194, + "epoch": 1.0416666666666667, + "grad_norm": 0.03884749859571457, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 650 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.22569099068641663, + "epoch": 1.0432692307692308, + "grad_norm": 0.039085451513528824, + "learning_rate": 1e-06, + "loss": 0.045, + "step": 651 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.22521612793207169, + "epoch": 1.044871794871795, + "grad_norm": 0.03394408896565437, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6388.0, + "completions/mean_length": 4014.796875, + "completions/mean_terminated_length": 3299.222900390625, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.21447355300188065, + "epoch": 1.046474358974359, + "frac_reward_zero_std": 0.0, + "grad_norm": 3.4653165340423584, + "learning_rate": 1e-06, + "loss": 0.0665, + "num_tokens": 623435790.0, + "reward": 0.18457384407520294, + "reward_std": 0.12713655829429626, + "rewards/progression_diversity/mean": -0.00013509648852050304, + "rewards/progression_diversity/std": 0.003056884743273258, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.5162923336029053, + "rewards/symbolic_reward_partial_score/std": 0.25753483176231384, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0391998291015625, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.8635755777359009, + "step": 653 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.21595758199691772, + "epoch": 1.0480769230769231, + "grad_norm": 839.4369506835938, + "learning_rate": 1e-06, + "loss": 0.1091, + "step": 654 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.20685593783855438, + "epoch": 1.0496794871794872, + "grad_norm": 0.036022186279296875, + "learning_rate": 1e-06, + "loss": 0.1617, + "step": 655 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.21082329750061035, + "epoch": 1.0512820512820513, + "grad_norm": 0.031025860458612442, + "learning_rate": 1e-06, + "loss": 0.0654, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6231.0, + "completions/mean_length": 4426.666015625, + "completions/mean_terminated_length": 3302.472412109375, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "entropy": 0.20376238226890564, + "epoch": 1.0528846153846154, + "frac_reward_zero_std": 0.03125, + "grad_norm": 223.56224060058594, + "learning_rate": 1e-06, + "loss": 0.0537, + "num_tokens": 626540099.0, + "reward": 0.1801775097846985, + "reward_std": 0.12979283928871155, + "rewards/progression_diversity/mean": -0.00031599291833117604, + "rewards/progression_diversity/std": 0.005377059802412987, + "rewards/symbolic_reward_accuracy/mean": 0.064453125, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.5003417730331421, + "rewards/symbolic_reward_partial_score/std": 0.23511230945587158, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0354280471801758, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 1.9148916006088257, + "step": 657 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.20688576996326447, + "epoch": 1.0544871794871795, + "grad_norm": 141.19863891601562, + "learning_rate": 1e-06, + "loss": 0.145, + "step": 658 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.20776833593845367, + "epoch": 1.0560897435897436, + "grad_norm": 4.635189056396484, + "learning_rate": 1e-06, + "loss": 0.1138, + "step": 659 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.21761061251163483, + "epoch": 1.0576923076923077, + "grad_norm": 0.028185199946165085, + "learning_rate": 1e-06, + "loss": 0.1123, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 4155.037109375, + "completions/mean_terminated_length": 3312.540771484375, + "completions/min_length": 717.0, + "completions/min_terminated_length": 717.0, + "entropy": 0.2150147706270218, + "epoch": 1.0592948717948718, + "frac_reward_zero_std": 0.03125, + "grad_norm": 94.42283630371094, + "learning_rate": 1e-06, + "loss": 0.0893, + "num_tokens": 629565094.0, + "reward": 0.16469581425189972, + "reward_std": 0.10881762206554413, + "rewards/progression_diversity/mean": -0.0006340488907881081, + "rewards/progression_diversity/std": 0.008337623439729214, + "rewards/symbolic_reward_accuracy/mean": 0.029296875, + "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, + "rewards/symbolic_reward_partial_score/mean": 0.5112468004226685, + "rewards/symbolic_reward_partial_score/std": 0.23792609572410583, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0353161096572876, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 2.2100651264190674, + "step": 661 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.21428611874580383, + "epoch": 1.060897435897436, + "grad_norm": 114.22917938232422, + "learning_rate": 1e-06, + "loss": 0.1123, + "step": 662 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2211027517914772, + "epoch": 1.0625, + "grad_norm": 0.0436987467110157, + "learning_rate": 1e-06, + "loss": 0.0534, + "step": 663 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.18693063408136368, + "epoch": 1.064102564102564, + "grad_norm": 2.552537679672241, + "learning_rate": 1e-06, + "loss": 0.2344, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6130.0, + "completions/mean_length": 4010.59765625, + "completions/mean_terminated_length": 3294.78076171875, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.2113461047410965, + "epoch": 1.0657051282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 246.3715362548828, + "learning_rate": 1e-06, + "loss": 0.0674, + "num_tokens": 632547896.0, + "reward": 0.1795777976512909, + "reward_std": 0.10678594559431076, + "rewards/progression_diversity/mean": -0.00022982530936133116, + "rewards/progression_diversity/std": 0.0034447184298187494, + "rewards/symbolic_reward_accuracy/mean": 0.052734375, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.5113606452941895, + "rewards/symbolic_reward_partial_score/std": 0.2122807502746582, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0385127067565918, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 1.2567355632781982, + "step": 665 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2206530198454857, + "epoch": 1.0673076923076923, + "grad_norm": 0.2656633257865906, + "learning_rate": 1e-06, + "loss": 0.0547, + "step": 666 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2168770357966423, + "epoch": 1.0689102564102564, + "grad_norm": 3.0182042121887207, + "learning_rate": 1e-06, + "loss": 0.125, + "step": 667 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.2043883576989174, + "epoch": 1.0705128205128205, + "grad_norm": 0.024378696456551552, + "learning_rate": 1e-06, + "loss": 0.1789, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15758.0, + "completions/mean_length": 4331.119140625, + "completions/mean_terminated_length": 3446.73583984375, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "entropy": 0.20979925245046616, + "epoch": 1.0721153846153846, + "frac_reward_zero_std": 0.03125, + "grad_norm": 180.8795928955078, + "learning_rate": 1e-06, + "loss": 0.1064, + "num_tokens": 635626965.0, + "reward": 0.15303614735603333, + "reward_std": 0.08873963356018066, + "rewards/progression_diversity/mean": -9.739572124090046e-05, + "rewards/progression_diversity/std": 0.0017328658141195774, + "rewards/symbolic_reward_accuracy/mean": 0.013671875, + "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, + "rewards/symbolic_reward_partial_score/mean": 0.5055663585662842, + "rewards/symbolic_reward_partial_score/std": 0.22250817716121674, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.034943699836731, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 2.280566692352295, + "step": 669 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.19934061914682388, + "epoch": 1.0737179487179487, + "grad_norm": 0.06182999163866043, + "learning_rate": 1e-06, + "loss": 0.1761, + "step": 670 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.21370511502027512, + "epoch": 1.0753205128205128, + "grad_norm": 0.03356883302330971, + "learning_rate": 1e-06, + "loss": 0.0624, + "step": 671 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.20709837973117828, + "epoch": 1.0769230769230769, + "grad_norm": 7.616127967834473, + "learning_rate": 1e-06, + "loss": 0.1033, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16086.0, + "completions/mean_length": 3967.392578125, + "completions/mean_terminated_length": 3488.862060546875, + "completions/min_length": 503.0, + "completions/min_terminated_length": 503.0, + "entropy": 0.21507354080677032, + "epoch": 1.078525641025641, + "frac_reward_zero_std": 0.0, + "grad_norm": 239.9991912841797, + "learning_rate": 1e-06, + "loss": 0.0799, + "num_tokens": 638507678.0, + "reward": 0.22426804900169373, + "reward_std": 0.14125576615333557, + "rewards/progression_diversity/mean": -0.00044275925029069185, + "rewards/progression_diversity/std": 0.008287720382213593, + "rewards/symbolic_reward_accuracy/mean": 0.10546875, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.5490071773529053, + "rewards/symbolic_reward_partial_score/std": 0.2383054494857788, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0389597415924072, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 1.678175687789917, + "step": 673 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.21100656688213348, + "epoch": 1.080128205128205, + "grad_norm": 2.742002487182617, + "learning_rate": 1e-06, + "loss": 0.0992, + "step": 674 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.21375902742147446, + "epoch": 1.0817307692307692, + "grad_norm": 0.04870569705963135, + "learning_rate": 1e-06, + "loss": 0.077, + "step": 675 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2123679220676422, + "epoch": 1.0833333333333333, + "grad_norm": 0.03225383535027504, + "learning_rate": 1e-06, + "loss": 0.0213, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7501.0, + "completions/mean_length": 3955.189453125, + "completions/mean_terminated_length": 3528.341552734375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.19846639037132263, + "epoch": 1.0849358974358974, + "frac_reward_zero_std": 0.0, + "grad_norm": 93.89030456542969, + "learning_rate": 1e-06, + "loss": 0.1439, + "num_tokens": 641467087.0, + "reward": 0.18885713815689087, + "reward_std": 0.09211981296539307, + "rewards/progression_diversity/mean": -2.9897617423557676e-05, + "rewards/progression_diversity/std": 0.0006765059079043567, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.5234049558639526, + "rewards/symbolic_reward_partial_score/std": 0.22506952285766602, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0395513772964478, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 696.0, + "sampling/sampling_logp_difference/mean": 0.8175865411758423, + "step": 677 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.21475867927074432, + "epoch": 1.0865384615384615, + "grad_norm": 0.04221729189157486, + "learning_rate": 1e-06, + "loss": 0.0553, + "step": 678 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2134266495704651, + "epoch": 1.0881410256410255, + "grad_norm": 0.03336584195494652, + "learning_rate": 1e-06, + "loss": 0.0467, + "step": 679 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.21357693523168564, + "epoch": 1.0897435897435896, + "grad_norm": 0.08009527623653412, + "learning_rate": 1e-06, + "loss": 0.0293, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6588.0, + "completions/mean_length": 3841.345703125, + "completions/mean_terminated_length": 3410.588134765625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.21339119970798492, + "epoch": 1.0913461538461537, + "frac_reward_zero_std": 0.0, + "grad_norm": 614.2329711914062, + "learning_rate": 1e-06, + "loss": 0.1081, + "num_tokens": 644259776.0, + "reward": 0.2062734067440033, + "reward_std": 0.1276092380285263, + "rewards/progression_diversity/mean": -0.0005905528087168932, + "rewards/progression_diversity/std": 0.006680304650217295, + "rewards/symbolic_reward_accuracy/mean": 0.076171875, + "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, + "rewards/symbolic_reward_partial_score/mean": 0.5463216304779053, + "rewards/symbolic_reward_partial_score/std": 0.24570463597774506, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0360360145568848, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 3.0028653144836426, + "step": 681 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.21380823105573654, + "epoch": 1.092948717948718, + "grad_norm": 0.03602517396211624, + "learning_rate": 1e-06, + "loss": 0.0344, + "step": 682 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.21220668405294418, + "epoch": 1.094551282051282, + "grad_norm": 0.03208322450518608, + "learning_rate": 1e-06, + "loss": 0.0598, + "step": 683 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2145799621939659, + "epoch": 1.0961538461538463, + "grad_norm": 0.026013914495706558, + "learning_rate": 1e-06, + "loss": 0.0583, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6038.0, + "completions/mean_length": 3467.587890625, + "completions/mean_terminated_length": 3131.088134765625, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.21561385691165924, + "epoch": 1.0977564102564104, + "frac_reward_zero_std": 0.0, + "grad_norm": 72.71430206298828, + "learning_rate": 1e-06, + "loss": 0.0558, + "num_tokens": 647003501.0, + "reward": 0.19664551317691803, + "reward_std": 0.09150449931621552, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.5467610359191895, + "rewards/symbolic_reward_partial_score/std": 0.21742354333400726, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0402486324310303, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 696.0, + "sampling/sampling_logp_difference/mean": 0.6293116211891174, + "step": 685 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.21327327191829681, + "epoch": 1.0993589743589745, + "grad_norm": 0.03850381448864937, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 686 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.21018948405981064, + "epoch": 1.1009615384615385, + "grad_norm": 696.9727783203125, + "learning_rate": 1e-06, + "loss": 0.2903, + "step": 687 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2201865389943123, + "epoch": 1.1025641025641026, + "grad_norm": 0.04112207144498825, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8348.0, + "completions/mean_length": 3814.763671875, + "completions/mean_terminated_length": 3487.30859375, + "completions/min_length": 676.0, + "completions/min_terminated_length": 676.0, + "entropy": 0.21837550401687622, + "epoch": 1.1041666666666667, + "frac_reward_zero_std": 0.0, + "grad_norm": 371.62738037109375, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 649739428.0, + "reward": 0.1545269787311554, + "reward_std": 0.053041648119688034, + "rewards/progression_diversity/mean": -0.00042648223461583257, + "rewards/progression_diversity/std": 0.006124202162027359, + "rewards/symbolic_reward_accuracy/mean": 0.001953125, + "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, + "rewards/symbolic_reward_partial_score/mean": 0.5196614265441895, + "rewards/symbolic_reward_partial_score/std": 0.16844609379768372, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0396302938461304, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 2.3374648094177246, + "step": 689 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.21820075809955597, + "epoch": 1.1057692307692308, + "grad_norm": 0.03433378040790558, + "learning_rate": 1e-06, + "loss": 0.1054, + "step": 690 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.21881910413503647, + "epoch": 1.107371794871795, + "grad_norm": 0.03191978111863136, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 691 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.22215040028095245, + "epoch": 1.108974358974359, + "grad_norm": 0.02860073745250702, + "learning_rate": 1e-06, + "loss": 0.0554, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7847.0, + "completions/mean_length": 3363.330078125, + "completions/mean_terminated_length": 3103.954345703125, + "completions/min_length": 541.0, + "completions/min_terminated_length": 541.0, + "entropy": 0.21577662229537964, + "epoch": 1.1105769230769231, + "frac_reward_zero_std": 0.0, + "grad_norm": 299.27655029296875, + "learning_rate": 1e-06, + "loss": 0.1036, + "num_tokens": 652366973.0, + "reward": 0.23104004561901093, + "reward_std": 0.10753922909498215, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.099609375, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.5774251222610474, + "rewards/symbolic_reward_partial_score/std": 0.2207508236169815, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0428193807601929, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 712.0, + "sampling/sampling_logp_difference/mean": 0.8698887228965759, + "step": 693 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22989095747470856, + "epoch": 1.1121794871794872, + "grad_norm": 0.051207710057497025, + "learning_rate": 1e-06, + "loss": -0.0037, + "step": 694 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2215583696961403, + "epoch": 1.1137820512820513, + "grad_norm": 0.03025507554411888, + "learning_rate": 1e-06, + "loss": 0.0495, + "step": 695 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22304167598485947, + "epoch": 1.1153846153846154, + "grad_norm": 0.03082253411412239, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6001.0, + "completions/mean_length": 3574.69140625, + "completions/mean_terminated_length": 3371.369140625, + "completions/min_length": 582.0, + "completions/min_terminated_length": 582.0, + "entropy": 0.22506634891033173, + "epoch": 1.1169871794871795, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03984767943620682, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 655022047.0, + "reward": 0.17143532633781433, + "reward_std": 0.09424548596143723, + "rewards/progression_diversity/mean": -2.249992212455254e-05, + "rewards/progression_diversity/std": 0.000509115110617131, + "rewards/symbolic_reward_accuracy/mean": 0.037109375, + "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, + "rewards/symbolic_reward_partial_score/mean": 0.50244140625, + "rewards/symbolic_reward_partial_score/std": 0.21935398876667023, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0433446168899536, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 1.2426462173461914, + "step": 697 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.22180306166410446, + "epoch": 1.1185897435897436, + "grad_norm": 108.0488052368164, + "learning_rate": 1e-06, + "loss": 0.0712, + "step": 698 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.227100670337677, + "epoch": 1.1201923076923077, + "grad_norm": 0.034184910356998444, + "learning_rate": 1e-06, + "loss": 0.039, + "step": 699 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2247518002986908, + "epoch": 1.1217948717948718, + "grad_norm": 0.03340495377779007, + "learning_rate": 1e-06, + "loss": 0.0312, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6562.0, + "completions/mean_length": 3566.66796875, + "completions/mean_terminated_length": 2991.19580078125, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "entropy": 0.20850242674350739, + "epoch": 1.123397435897436, + "frac_reward_zero_std": 0.0, + "grad_norm": 87.1917495727539, + "learning_rate": 1e-06, + "loss": 0.1712, + "num_tokens": 657830693.0, + "reward": 0.1684723198413849, + "reward_std": 0.07193771004676819, + "rewards/progression_diversity/mean": -0.0009145288495346904, + "rewards/progression_diversity/std": 0.013037758879363537, + "rewards/symbolic_reward_accuracy/mean": 0.01171875, + "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, + "rewards/symbolic_reward_partial_score/mean": 0.5511881113052368, + "rewards/symbolic_reward_partial_score/std": 0.19447360932826996, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0334614515304565, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 4.139370918273926, + "step": 701 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.21462332457304, + "epoch": 1.125, + "grad_norm": 0.03925897926092148, + "learning_rate": 1e-06, + "loss": 0.1, + "step": 702 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2194051519036293, + "epoch": 1.126602564102564, + "grad_norm": 0.032283466309309006, + "learning_rate": 1e-06, + "loss": 0.068, + "step": 703 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2256844863295555, + "epoch": 1.1282051282051282, + "grad_norm": 0.027121527120471, + "learning_rate": 1e-06, + "loss": 0.0355, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7975.0, + "completions/mean_length": 3463.818359375, + "completions/mean_terminated_length": 3284.726806640625, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 0.22059781849384308, + "epoch": 1.1298076923076923, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.042044855654239655, + "learning_rate": 1e-06, + "loss": 0.0491, + "num_tokens": 660390200.0, + "reward": 0.19374506175518036, + "reward_std": 0.11770647019147873, + "rewards/progression_diversity/mean": -6.273954568314366e-06, + "rewards/progression_diversity/std": 0.00014196339179761708, + "rewards/symbolic_reward_accuracy/mean": 0.06640625, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.5175618529319763, + "rewards/symbolic_reward_partial_score/std": 0.2401990294456482, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0451421737670898, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 712.0, + "sampling/sampling_logp_difference/mean": 0.5323032736778259, + "step": 705 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2260003685951233, + "epoch": 1.1314102564102564, + "grad_norm": 0.026536772027611732, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 706 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2393200546503067, + "epoch": 1.1330128205128205, + "grad_norm": 0.036755435168743134, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 707 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.23273401707410812, + "epoch": 1.1346153846153846, + "grad_norm": 0.02972438745200634, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5406.0, + "completions/mean_length": 3774.552734375, + "completions/mean_terminated_length": 3288.590087890625, + "completions/min_length": 762.0, + "completions/min_terminated_length": 762.0, + "entropy": 0.218715637922287, + "epoch": 1.1362179487179487, + "frac_reward_zero_std": 0.0, + "grad_norm": 1383.7510986328125, + "learning_rate": 1e-06, + "loss": 0.0569, + "num_tokens": 663134643.0, + "reward": 0.18339823186397552, + "reward_std": 0.08036380261182785, + "rewards/progression_diversity/mean": -0.0005094322841614485, + "rewards/progression_diversity/std": 0.008040121756494045, + "rewards/symbolic_reward_accuracy/mean": 0.041015625, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.5416829586029053, + "rewards/symbolic_reward_partial_score/std": 0.21756330132484436, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.038155198097229, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 3.4172184467315674, + "step": 709 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.23037738353013992, + "epoch": 1.1378205128205128, + "grad_norm": 0.028344059363007545, + "learning_rate": 1e-06, + "loss": 0.0167, + "step": 710 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.22201494872570038, + "epoch": 1.1394230769230769, + "grad_norm": 0.04986124113202095, + "learning_rate": 1e-06, + "loss": 0.0701, + "step": 711 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.22669429332017899, + "epoch": 1.141025641025641, + "grad_norm": 0.03080112673342228, + "learning_rate": 1e-06, + "loss": 0.0666, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5973.0, + "completions/mean_length": 3245.638671875, + "completions/mean_terminated_length": 3089.847900390625, + "completions/min_length": 517.0, + "completions/min_terminated_length": 517.0, + "entropy": 0.23609895259141922, + "epoch": 1.142628205128205, + "frac_reward_zero_std": 0.0625, + "grad_norm": 188.2184295654297, + "learning_rate": 1e-06, + "loss": 0.0473, + "num_tokens": 665642874.0, + "reward": 0.21525543928146362, + "reward_std": 0.0937977284193039, + "rewards/progression_diversity/mean": -0.000335338176228106, + "rewards/progression_diversity/std": 0.005407850258052349, + "rewards/symbolic_reward_accuracy/mean": 0.08203125, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.557373046875, + "rewards/symbolic_reward_partial_score/std": 0.2241562008857727, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0440669059753418, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.9209955930709839, + "step": 713 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23579012602567673, + "epoch": 1.1442307692307692, + "grad_norm": 0.031340159475803375, + "learning_rate": 1e-06, + "loss": -0.0103, + "step": 714 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23112714290618896, + "epoch": 1.1458333333333333, + "grad_norm": 0.03565249219536781, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 715 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2343195155262947, + "epoch": 1.1474358974358974, + "grad_norm": 0.03568951413035393, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5608.0, + "completions/mean_length": 3687.26171875, + "completions/mean_terminated_length": 3224.627685546875, + "completions/min_length": 512.0, + "completions/min_terminated_length": 512.0, + "entropy": 0.23093228042125702, + "epoch": 1.1490384615384615, + "frac_reward_zero_std": 0.03125, + "grad_norm": 25.515024185180664, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 668387632.0, + "reward": 0.1549118608236313, + "reward_std": 0.06852319091558456, + "rewards/progression_diversity/mean": -0.001002308912575245, + "rewards/progression_diversity/std": 0.015932733193039894, + "rewards/symbolic_reward_accuracy/mean": 0.015625, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.49687501788139343, + "rewards/symbolic_reward_partial_score/std": 0.20396628975868225, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041079044342041, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 1.6645785570144653, + "step": 717 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.22284479439258575, + "epoch": 1.1506410256410255, + "grad_norm": 0.03507667034864426, + "learning_rate": 1e-06, + "loss": 0.0831, + "step": 718 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2239360809326172, + "epoch": 1.1522435897435896, + "grad_norm": 0.03187626227736473, + "learning_rate": 1e-06, + "loss": 0.0759, + "step": 719 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2218421921133995, + "epoch": 1.1538461538461537, + "grad_norm": 0.05089287832379341, + "learning_rate": 1e-06, + "loss": 0.0547, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5998.0, + "completions/mean_length": 3646.466796875, + "completions/mean_terminated_length": 3314.627197265625, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "entropy": 0.2157435417175293, + "epoch": 1.155448717948718, + "frac_reward_zero_std": 0.0, + "grad_norm": 491.061767578125, + "learning_rate": 1e-06, + "loss": 0.0744, + "num_tokens": 671153135.0, + "reward": 0.16443172097206116, + "reward_std": 0.08347620069980621, + "rewards/progression_diversity/mean": -0.00018694471509661525, + "rewards/progression_diversity/std": 0.00423007644712925, + "rewards/symbolic_reward_accuracy/mean": 0.025390625, + "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, + "rewards/symbolic_reward_partial_score/mean": 0.5057942867279053, + "rewards/symbolic_reward_partial_score/std": 0.22736532986164093, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0419472455978394, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.7793669700622559, + "step": 721 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.23054082691669464, + "epoch": 1.157051282051282, + "grad_norm": 0.038431961089372635, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 722 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.22331822663545609, + "epoch": 1.1586538461538463, + "grad_norm": 0.030653027817606926, + "learning_rate": 1e-06, + "loss": 0.0586, + "step": 723 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.22740525007247925, + "epoch": 1.1602564102564104, + "grad_norm": 0.03620804473757744, + "learning_rate": 1e-06, + "loss": 0.0345, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5344.0, + "completions/mean_length": 3325.42578125, + "completions/mean_terminated_length": 3222.602294921875, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "entropy": 0.2329612523317337, + "epoch": 1.1618589743589745, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.042414553463459015, + "learning_rate": 1e-06, + "loss": 0.0642, + "num_tokens": 673598329.0, + "reward": 0.18591231107711792, + "reward_std": 0.08399780839681625, + "rewards/progression_diversity/mean": -7.887653919169679e-05, + "rewards/progression_diversity/std": 0.0017324578948318958, + "rewards/symbolic_reward_accuracy/mean": 0.048828125, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.524658203125, + "rewards/symbolic_reward_partial_score/std": 0.23230838775634766, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0474889278411865, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 724.0, + "sampling/sampling_logp_difference/mean": 0.47188305854797363, + "step": 725 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23176733404397964, + "epoch": 1.1634615384615385, + "grad_norm": 61.60709762573242, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 726 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23428219556808472, + "epoch": 1.1650641025641026, + "grad_norm": 0.038381967693567276, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 727 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23280168324708939, + "epoch": 1.1666666666666667, + "grad_norm": 0.02906009368598461, + "learning_rate": 1e-06, + "loss": -0.0278, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4910.0, + "completions/mean_length": 3052.662109375, + "completions/mean_terminated_length": 2867.871337890625, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 0.2368728667497635, + "epoch": 1.1682692307692308, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.03455471992492676, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 676083596.0, + "reward": 0.19645507633686066, + "reward_std": 0.09786352515220642, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.5344075560569763, + "rewards/symbolic_reward_partial_score/std": 0.22556960582733154, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0460604429244995, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.4928211271762848, + "step": 729 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2318621650338173, + "epoch": 1.169871794871795, + "grad_norm": 0.04372532665729523, + "learning_rate": 1e-06, + "loss": -0.0078, + "step": 730 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23245908319950104, + "epoch": 1.171474358974359, + "grad_norm": 0.10342668741941452, + "learning_rate": 1e-06, + "loss": 0.0665, + "step": 731 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2263612002134323, + "epoch": 1.1730769230769231, + "grad_norm": 36.62001419067383, + "learning_rate": 1e-06, + "loss": 0.1059, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5687.0, + "completions/mean_length": 3134.9296875, + "completions/mean_terminated_length": 2951.279296875, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 0.23260176926851273, + "epoch": 1.1746794871794872, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.0437169186770916, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 678543112.0, + "reward": 0.1655300259590149, + "reward_std": 0.08380260318517685, + "rewards/progression_diversity/mean": -0.0007075904868543148, + "rewards/progression_diversity/std": 0.012582487426698208, + "rewards/symbolic_reward_accuracy/mean": 0.03515625, + "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, + "rewards/symbolic_reward_partial_score/mean": 0.48538413643836975, + "rewards/symbolic_reward_partial_score/std": 0.22206903994083405, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.042715311050415, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.9027316570281982, + "step": 733 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2210148423910141, + "epoch": 1.1762820512820513, + "grad_norm": 185.5412139892578, + "learning_rate": 1e-06, + "loss": 0.0454, + "step": 734 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.22284194827079773, + "epoch": 1.1778846153846154, + "grad_norm": 0.0355803407728672, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 735 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.22447482496500015, + "epoch": 1.1794871794871795, + "grad_norm": 0.03734675049781799, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5229.0, + "completions/mean_length": 2980.09765625, + "completions/mean_terminated_length": 2847.909423828125, + "completions/min_length": 523.0, + "completions/min_terminated_length": 523.0, + "entropy": 0.22639226913452148, + "epoch": 1.1810897435897436, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03936491161584854, + "learning_rate": 1e-06, + "loss": 0.0262, + "num_tokens": 681020794.0, + "reward": 0.19800294935703278, + "reward_std": 0.10915393382310867, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.052734375, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.5577962398529053, + "rewards/symbolic_reward_partial_score/std": 0.2130732387304306, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.043833613395691, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.7718756794929504, + "step": 737 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.22041287273168564, + "epoch": 1.1826923076923077, + "grad_norm": 0.025883983820676804, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 738 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.22687563300132751, + "epoch": 1.1842948717948718, + "grad_norm": 0.030974598601460457, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 739 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.22128045558929443, + "epoch": 1.185897435897436, + "grad_norm": 0.05905633792281151, + "learning_rate": 1e-06, + "loss": 0.0396, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4864.0, + "completions/mean_length": 3153.501953125, + "completions/mean_terminated_length": 3023.023681640625, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "entropy": 0.23090147227048874, + "epoch": 1.1875, + "frac_reward_zero_std": 0.09375, + "grad_norm": 331.0104064941406, + "learning_rate": 1e-06, + "loss": 0.0263, + "num_tokens": 683450971.0, + "reward": 0.16851550340652466, + "reward_std": 0.07939761132001877, + "rewards/progression_diversity/mean": -0.000989372143521905, + "rewards/progression_diversity/std": 0.013076537288725376, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.48753252625465393, + "rewards/symbolic_reward_partial_score/std": 0.20784252882003784, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0429353713989258, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 2.1165027618408203, + "step": 741 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.22930524498224258, + "epoch": 1.189102564102564, + "grad_norm": 0.029941659420728683, + "learning_rate": 1e-06, + "loss": 0.041, + "step": 742 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22974687814712524, + "epoch": 1.1907051282051282, + "grad_norm": 0.025151744484901428, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 743 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2339945062994957, + "epoch": 1.1923076923076923, + "grad_norm": 0.02276208996772766, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5222.0, + "completions/mean_length": 3041.041015625, + "completions/mean_terminated_length": 2909.45361328125, + "completions/min_length": 678.0, + "completions/min_terminated_length": 678.0, + "entropy": 0.23491770029067993, + "epoch": 1.1939102564102564, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.05074154585599899, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 685896720.0, + "reward": 0.21035856008529663, + "reward_std": 0.08227989077568054, + "rewards/progression_diversity/mean": -0.0007646906888112426, + "rewards/progression_diversity/std": 0.010030530393123627, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.5794759392738342, + "rewards/symbolic_reward_partial_score/std": 0.1977068930864334, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0442752838134766, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 2.015608072280884, + "step": 745 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2312600165605545, + "epoch": 1.1955128205128205, + "grad_norm": 0.03379121422767639, + "learning_rate": 1e-06, + "loss": 0.0464, + "step": 746 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2323080375790596, + "epoch": 1.1971153846153846, + "grad_norm": 0.02059108205139637, + "learning_rate": 1e-06, + "loss": 0.0359, + "step": 747 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2359512448310852, + "epoch": 1.1987179487179487, + "grad_norm": 0.026128700003027916, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5081.0, + "completions/mean_length": 3201.248046875, + "completions/mean_terminated_length": 2748.507080078125, + "completions/min_length": 579.0, + "completions/min_terminated_length": 579.0, + "entropy": 0.21788237243890762, + "epoch": 1.2003205128205128, + "frac_reward_zero_std": 0.03125, + "grad_norm": 229.65496826171875, + "learning_rate": 1e-06, + "loss": 0.0528, + "num_tokens": 688491791.0, + "reward": 0.17160627245903015, + "reward_std": 0.09746118634939194, + "rewards/progression_diversity/mean": -0.0005057294620200992, + "rewards/progression_diversity/std": 0.005976199172437191, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.5030273795127869, + "rewards/symbolic_reward_partial_score/std": 0.2094988375902176, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0365022420883179, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 3.8983547687530518, + "step": 749 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.23051826655864716, + "epoch": 1.2019230769230769, + "grad_norm": 0.03440074622631073, + "learning_rate": 1e-06, + "loss": 0.0177, + "step": 750 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.22294911742210388, + "epoch": 1.203525641025641, + "grad_norm": 0.028672844171524048, + "learning_rate": 1e-06, + "loss": 0.0646, + "step": 751 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2228538542985916, + "epoch": 1.205128205128205, + "grad_norm": 36.360984802246094, + "learning_rate": 1e-06, + "loss": 0.0626, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5348.0, + "completions/mean_length": 2809.646484375, + "completions/mean_terminated_length": 2729.640625, + "completions/min_length": 551.0, + "completions/min_terminated_length": 551.0, + "entropy": 0.23874470591545105, + "epoch": 1.2067307692307692, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.05420081317424774, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 690740314.0, + "reward": 0.1803320348262787, + "reward_std": 0.08125782012939453, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.041015625, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.5210286378860474, + "rewards/symbolic_reward_partial_score/std": 0.19359782338142395, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0487172603607178, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.39224666357040405, + "step": 753 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2426346391439438, + "epoch": 1.2083333333333333, + "grad_norm": 106.4404296875, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 754 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.23718024045228958, + "epoch": 1.2099358974358974, + "grad_norm": 0.03360147029161453, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 755 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.24055323004722595, + "epoch": 1.2115384615384615, + "grad_norm": 0.028352908790111542, + "learning_rate": 1e-06, + "loss": -0.0133, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4704.0, + "completions/mean_length": 2882.4375, + "completions/mean_terminated_length": 2585.99609375, + "completions/min_length": 550.0, + "completions/min_terminated_length": 550.0, + "entropy": 0.2231486737728119, + "epoch": 1.2131410256410255, + "frac_reward_zero_std": 0.03125, + "grad_norm": 601.402587890625, + "learning_rate": 1e-06, + "loss": 0.0973, + "num_tokens": 693148362.0, + "reward": 0.25671058893203735, + "reward_std": 0.10707536339759827, + "rewards/progression_diversity/mean": -0.000815525883808732, + "rewards/progression_diversity/std": 0.010911921970546246, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.5927083492279053, + "rewards/symbolic_reward_partial_score/std": 0.24597984552383423, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0398728847503662, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 3.051284074783325, + "step": 757 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.23185677826404572, + "epoch": 1.2147435897435896, + "grad_norm": 0.03557441011071205, + "learning_rate": 1e-06, + "loss": 0.0471, + "step": 758 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.23132619261741638, + "epoch": 1.2163461538461537, + "grad_norm": 0.018664143979549408, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 759 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2319474294781685, + "epoch": 1.217948717948718, + "grad_norm": 0.017118403688073158, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5218.0, + "completions/mean_length": 2768.322265625, + "completions/mean_terminated_length": 2606.87158203125, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.2300676926970482, + "epoch": 1.219551282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 52.74298095703125, + "learning_rate": 1e-06, + "loss": 0.0637, + "num_tokens": 695437359.0, + "reward": 0.18998649716377258, + "reward_std": 0.09169755131006241, + "rewards/progression_diversity/mean": -0.00037375889951363206, + "rewards/progression_diversity/std": 0.00622538710013032, + "rewards/symbolic_reward_accuracy/mean": 0.0546875, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.52783203125, + "rewards/symbolic_reward_partial_score/std": 0.2111833095550537, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045241117477417, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.6731740236282349, + "step": 761 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.23460698127746582, + "epoch": 1.2211538461538463, + "grad_norm": 1694.8343505859375, + "learning_rate": 1e-06, + "loss": 0.1124, + "step": 762 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.24004460126161575, + "epoch": 1.2227564102564104, + "grad_norm": 0.038948699831962585, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 763 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.24492079764604568, + "epoch": 1.2243589743589745, + "grad_norm": 0.02253109961748123, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4993.0, + "completions/mean_length": 2854.958984375, + "completions/mean_terminated_length": 2748.43115234375, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "entropy": 0.2323237955570221, + "epoch": 1.2259615384615385, + "frac_reward_zero_std": 0.0, + "grad_norm": 1602.8712158203125, + "learning_rate": 1e-06, + "loss": 0.0338, + "num_tokens": 697774042.0, + "reward": 0.187491774559021, + "reward_std": 0.09734513610601425, + "rewards/progression_diversity/mean": -0.00033558663562871516, + "rewards/progression_diversity/std": 0.005632203537970781, + "rewards/symbolic_reward_accuracy/mean": 0.048828125, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.5299316644668579, + "rewards/symbolic_reward_partial_score/std": 0.21817044913768768, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0454214811325073, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.2177436351776123, + "step": 765 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22870472818613052, + "epoch": 1.2275641025641026, + "grad_norm": 0.02481686696410179, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 766 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2385152205824852, + "epoch": 1.2291666666666667, + "grad_norm": 0.0412432886660099, + "learning_rate": 1e-06, + "loss": -0.0112, + "step": 767 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.23105958104133606, + "epoch": 1.2307692307692308, + "grad_norm": 0.04013615846633911, + "learning_rate": 1e-06, + "loss": 0.0401, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5212.0, + "completions/mean_length": 3133.908203125, + "completions/mean_terminated_length": 2896.828857421875, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "entropy": 0.2316986247897148, + "epoch": 1.232371794871795, + "frac_reward_zero_std": 0.03125, + "grad_norm": 384.80572509765625, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 700265723.0, + "reward": 0.19464799761772156, + "reward_std": 0.07056570053100586, + "rewards/progression_diversity/mean": -0.0005334917223080993, + "rewards/progression_diversity/std": 0.00818613264709711, + "rewards/symbolic_reward_accuracy/mean": 0.044921875, + "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, + "rewards/symbolic_reward_partial_score/mean": 0.5648600459098816, + "rewards/symbolic_reward_partial_score/std": 0.20340082049369812, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.042616367340088, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 2.089385509490967, + "step": 769 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.23712345212697983, + "epoch": 1.233974358974359, + "grad_norm": 0.03780589625239372, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 770 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2268049567937851, + "epoch": 1.2355769230769231, + "grad_norm": 0.028598081320524216, + "learning_rate": 1e-06, + "loss": 0.0633, + "step": 771 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.22704008221626282, + "epoch": 1.2371794871794872, + "grad_norm": 0.036604560911655426, + "learning_rate": 1e-06, + "loss": 0.0556, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5111.0, + "completions/mean_length": 2862.83203125, + "completions/mean_terminated_length": 2729.4873046875, + "completions/min_length": 484.0, + "completions/min_terminated_length": 484.0, + "entropy": 0.23467671126127243, + "epoch": 1.2387820512820513, + "frac_reward_zero_std": 0.0625, + "grad_norm": 248.04339599609375, + "learning_rate": 1e-06, + "loss": 0.0378, + "num_tokens": 702571989.0, + "reward": 0.16636018455028534, + "reward_std": 0.07103414833545685, + "rewards/progression_diversity/mean": -0.00021245863172225654, + "rewards/progression_diversity/std": 0.004727643448859453, + "rewards/symbolic_reward_accuracy/mean": 0.017578125, + "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, + "rewards/symbolic_reward_partial_score/mean": 0.5232909917831421, + "rewards/symbolic_reward_partial_score/std": 0.18974992632865906, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0468013286590576, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.2262119054794312, + "step": 773 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2362857237458229, + "epoch": 1.2403846153846154, + "grad_norm": 0.048055924475193024, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 774 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.23780542612075806, + "epoch": 1.2419871794871795, + "grad_norm": 0.02682042494416237, + "learning_rate": 1e-06, + "loss": -0.0151, + "step": 775 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2368232011795044, + "epoch": 1.2435897435897436, + "grad_norm": 0.09740027785301208, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5104.0, + "completions/mean_length": 2685.009765625, + "completions/mean_terminated_length": 2577.143798828125, + "completions/min_length": 489.0, + "completions/min_terminated_length": 489.0, + "entropy": 0.2424686774611473, + "epoch": 1.2451923076923077, + "frac_reward_zero_std": 0.03125, + "grad_norm": 399.4440612792969, + "learning_rate": 1e-06, + "loss": 0.0333, + "num_tokens": 704759194.0, + "reward": 0.1672852635383606, + "reward_std": 0.0681874230504036, + "rewards/progression_diversity/mean": -0.0009661708609201014, + "rewards/progression_diversity/std": 0.01556091196835041, + "rewards/symbolic_reward_accuracy/mean": 0.025390625, + "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, + "rewards/symbolic_reward_partial_score/mean": 0.5088216066360474, + "rewards/symbolic_reward_partial_score/std": 0.19267694652080536, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0457208156585693, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 2.5860910415649414, + "step": 777 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.24405201524496078, + "epoch": 1.2467948717948718, + "grad_norm": 0.040985848754644394, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 778 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.24193061143159866, + "epoch": 1.248397435897436, + "grad_norm": 0.029141481965780258, + "learning_rate": 1e-06, + "loss": 0.0323, + "step": 779 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.24474172294139862, + "epoch": 1.25, + "grad_norm": 0.03860503062605858, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4847.0, + "completions/mean_length": 2609.923828125, + "completions/mean_terminated_length": 2555.907958984375, + "completions/min_length": 534.0, + "completions/min_terminated_length": 534.0, + "entropy": 0.23170283436775208, + "epoch": 1.251602564102564, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.0442928783595562, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 707031571.0, + "reward": 0.18274357914924622, + "reward_std": 0.06679178774356842, + "rewards/progression_diversity/mean": -0.0005459238309413195, + "rewards/progression_diversity/std": 0.008116460405290127, + "rewards/symbolic_reward_accuracy/mean": 0.0234375, + "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, + "rewards/symbolic_reward_partial_score/mean": 0.5635905265808105, + "rewards/symbolic_reward_partial_score/std": 0.20824402570724487, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045533299446106, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.4784348011016846, + "step": 781 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.23342838883399963, + "epoch": 1.2532051282051282, + "grad_norm": 0.03521363064646721, + "learning_rate": 1e-06, + "loss": 0.0218, + "step": 782 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2293540984392166, + "epoch": 1.2548076923076923, + "grad_norm": 0.028306540101766586, + "learning_rate": 1e-06, + "loss": -0.0201, + "step": 783 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.22597584128379822, + "epoch": 1.2564102564102564, + "grad_norm": 0.024497980251908302, + "learning_rate": 1e-06, + "loss": 0.0313, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4343.0, + "completions/mean_length": 2555.701171875, + "completions/mean_terminated_length": 2474.198486328125, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.22865238785743713, + "epoch": 1.2580128205128205, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.03943171352148056, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 709195418.0, + "reward": 0.193578839302063, + "reward_std": 0.087828129529953, + "rewards/progression_diversity/mean": -2.721707096497994e-05, + "rewards/progression_diversity/std": 0.0006158520118333399, + "rewards/symbolic_reward_accuracy/mean": 0.048828125, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.549560546875, + "rewards/symbolic_reward_partial_score/std": 0.19566665589809418, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045490026473999, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.5368279218673706, + "step": 785 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.22690270096063614, + "epoch": 1.2596153846153846, + "grad_norm": 0.0247288029640913, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 786 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2224835455417633, + "epoch": 1.2612179487179487, + "grad_norm": 0.03503105789422989, + "learning_rate": 1e-06, + "loss": 0.0801, + "step": 787 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22508803755044937, + "epoch": 1.2628205128205128, + "grad_norm": 0.033731210976839066, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4441.0, + "completions/mean_length": 2418.279296875, + "completions/mean_terminated_length": 2280.55029296875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.2210671380162239, + "epoch": 1.2644230769230769, + "frac_reward_zero_std": 0.0625, + "grad_norm": 72.21678924560547, + "learning_rate": 1e-06, + "loss": 0.0407, + "num_tokens": 711362041.0, + "reward": 0.17944911122322083, + "reward_std": 0.08195337653160095, + "rewards/progression_diversity/mean": -0.00040262757102027535, + "rewards/progression_diversity/std": 0.006931956857442856, + "rewards/symbolic_reward_accuracy/mean": 0.03125, + "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, + "rewards/symbolic_reward_partial_score/mean": 0.5389323234558105, + "rewards/symbolic_reward_partial_score/std": 0.18118998408317566, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0432648658752441, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.1448458433151245, + "step": 789 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.22485744953155518, + "epoch": 1.266025641025641, + "grad_norm": 0.02760397456586361, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 790 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.22400467097759247, + "epoch": 1.267628205128205, + "grad_norm": 0.031739212572574615, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 791 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2254156917333603, + "epoch": 1.2692307692307692, + "grad_norm": 0.03261100500822067, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4488.0, + "completions/mean_length": 2580.109375, + "completions/mean_terminated_length": 2388.768310546875, + "completions/min_length": 580.0, + "completions/min_terminated_length": 580.0, + "entropy": 0.2238694727420807, + "epoch": 1.2708333333333333, + "frac_reward_zero_std": 0.03125, + "grad_norm": 277.4549255371094, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 713579937.0, + "reward": 0.18201328814029694, + "reward_std": 0.0664268359541893, + "rewards/progression_diversity/mean": -0.0013090292923152447, + "rewards/progression_diversity/std": 0.013768412172794342, + "rewards/symbolic_reward_accuracy/mean": 0.021484375, + "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, + "rewards/symbolic_reward_partial_score/mean": 0.5670410394668579, + "rewards/symbolic_reward_partial_score/std": 0.1917123794555664, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0389268398284912, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 2.950878143310547, + "step": 793 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2173541858792305, + "epoch": 1.2724358974358974, + "grad_norm": 745.087890625, + "learning_rate": 1e-06, + "loss": 0.0668, + "step": 794 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.22230473905801773, + "epoch": 1.2740384615384617, + "grad_norm": 0.03551755100488663, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 795 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.22595015913248062, + "epoch": 1.2756410256410255, + "grad_norm": 0.025852041319012642, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4958.0, + "completions/mean_length": 2487.400390625, + "completions/mean_terminated_length": 2405.4951171875, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 0.22661074995994568, + "epoch": 1.2772435897435899, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.04282950237393379, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 715731470.0, + "reward": 0.24059391021728516, + "reward_std": 0.08692439645528793, + "rewards/progression_diversity/mean": -0.00017893135373014957, + "rewards/progression_diversity/std": 0.0020902305841445923, + "rewards/symbolic_reward_accuracy/mean": 0.10546875, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.5930013060569763, + "rewards/symbolic_reward_partial_score/std": 0.2335280328989029, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.043853759765625, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.3285684883594513, + "step": 797 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2201601192355156, + "epoch": 1.2788461538461537, + "grad_norm": 0.03756594657897949, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 798 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2165718972682953, + "epoch": 1.280448717948718, + "grad_norm": 0.02026909776031971, + "learning_rate": 1e-06, + "loss": 0.0363, + "step": 799 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2138141244649887, + "epoch": 1.282051282051282, + "grad_norm": 0.021396497264504433, + "learning_rate": 1e-06, + "loss": 0.0612, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5352.0, + "completions/mean_length": 2308.837890625, + "completions/mean_terminated_length": 2198.009765625, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.2189565673470497, + "epoch": 1.2836538461538463, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.04113735631108284, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 717814763.0, + "reward": 0.18618622422218323, + "reward_std": 0.06758680939674377, + "rewards/progression_diversity/mean": -0.0005197992431931198, + "rewards/progression_diversity/std": 0.008483768440783024, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.545117199420929, + "rewards/symbolic_reward_partial_score/std": 0.2002282440662384, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0427607297897339, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.610470175743103, + "step": 801 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.21421056240797043, + "epoch": 1.2852564102564101, + "grad_norm": 0.02349807135760784, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 802 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.21766190230846405, + "epoch": 1.2868589743589745, + "grad_norm": 0.024225391447544098, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 803 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.22310616821050644, + "epoch": 1.2884615384615383, + "grad_norm": 0.05020375922322273, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4344.0, + "completions/max_terminated_length": 4344.0, + "completions/mean_length": 2237.517578125, + "completions/mean_terminated_length": 2237.517578125, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.22092020511627197, + "epoch": 1.2900641025641026, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.034722164273262024, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 719703236.0, + "reward": 0.24485130608081818, + "reward_std": 0.12719884514808655, + "rewards/progression_diversity/mean": -0.00022179064399097115, + "rewards/progression_diversity/std": 0.003463014727458358, + "rewards/symbolic_reward_accuracy/mean": 0.11328125, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.5896158814430237, + "rewards/symbolic_reward_partial_score/std": 0.21620358526706696, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0438671112060547, + "sampling/importance_sampling_ratio/min": 1.0395459602315473e-15, + "sampling/sampling_logp_difference/max": 34.49999237060547, + "sampling/sampling_logp_difference/mean": 0.09283532202243805, + "step": 805 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.22089938074350357, + "epoch": 1.2916666666666667, + "grad_norm": 0.027210041880607605, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 806 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.21874121576547623, + "epoch": 1.2932692307692308, + "grad_norm": 0.024275539442896843, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 807 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.34375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.21499209105968475, + "epoch": 1.294871794871795, + "grad_norm": 0.03609732910990715, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4430.0, + "completions/mean_length": 2284.5625, + "completions/mean_terminated_length": 2173.543212890625, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "entropy": 0.21107368171215057, + "epoch": 1.296474358974359, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.03534679487347603, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 721772084.0, + "reward": 0.1930440068244934, + "reward_std": 0.0763799324631691, + "rewards/progression_diversity/mean": -0.0007751868688501418, + "rewards/progression_diversity/std": 0.01335845235735178, + "rewards/symbolic_reward_accuracy/mean": 0.046875, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.5523600578308105, + "rewards/symbolic_reward_partial_score/std": 0.20780852437019348, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.040164828300476, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.9508891105651855, + "step": 809 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.21084369719028473, + "epoch": 1.2980769230769231, + "grad_norm": 0.031098656356334686, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 810 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2036626636981964, + "epoch": 1.2996794871794872, + "grad_norm": 0.04239289462566376, + "learning_rate": 1e-06, + "loss": 0.0488, + "step": 811 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.20902617275714874, + "epoch": 1.3012820512820513, + "grad_norm": 0.03809356689453125, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5018.0, + "completions/mean_length": 2127.1875, + "completions/mean_terminated_length": 1986.5877685546875, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "entropy": 0.20260044187307358, + "epoch": 1.3028846153846154, + "frac_reward_zero_std": 0.03125, + "grad_norm": 216.08346557617188, + "learning_rate": 1e-06, + "loss": 0.0261, + "num_tokens": 723870292.0, + "reward": 0.17801672220230103, + "reward_std": 0.07631928473711014, + "rewards/progression_diversity/mean": -8.632877870695665e-05, + "rewards/progression_diversity/std": 0.0017233239486813545, + "rewards/symbolic_reward_accuracy/mean": 0.0234375, + "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, + "rewards/symbolic_reward_partial_score/mean": 0.5497721433639526, + "rewards/symbolic_reward_partial_score/std": 0.19805040955543518, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.038082242012024, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.1360535621643066, + "step": 813 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.20947977900505066, + "epoch": 1.3044871794871795, + "grad_norm": 0.03403943032026291, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 814 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.19642490148544312, + "epoch": 1.3060897435897436, + "grad_norm": 0.020816028118133545, + "learning_rate": 1e-06, + "loss": 0.0554, + "step": 815 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.20714019238948822, + "epoch": 1.3076923076923077, + "grad_norm": 0.02245701290667057, + "learning_rate": 1e-06, + "loss": -0.0076, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4542.0, + "completions/mean_length": 2476.328125, + "completions/mean_terminated_length": 2339.171630859375, + "completions/min_length": 549.0, + "completions/min_terminated_length": 549.0, + "entropy": 0.20043767243623734, + "epoch": 1.3092948717948718, + "frac_reward_zero_std": 0.03125, + "grad_norm": 203.6943817138672, + "learning_rate": 1e-06, + "loss": 0.0633, + "num_tokens": 725929244.0, + "reward": 0.2166806012392044, + "reward_std": 0.10392580181360245, + "rewards/progression_diversity/mean": -0.0003975919389631599, + "rewards/progression_diversity/std": 0.004248625598847866, + "rewards/symbolic_reward_accuracy/mean": 0.087890625, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.549755871295929, + "rewards/symbolic_reward_partial_score/std": 0.21964266896247864, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.03891921043396, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.3425063192844391, + "step": 817 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2018059492111206, + "epoch": 1.310897435897436, + "grad_norm": 0.02881396934390068, + "learning_rate": 1e-06, + "loss": 0.0388, + "step": 818 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.20520929992198944, + "epoch": 1.3125, + "grad_norm": 0.029828235507011414, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 819 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.20675267279148102, + "epoch": 1.314102564102564, + "grad_norm": 0.02294883131980896, + "learning_rate": 1e-06, + "loss": -0.0073, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4985.0, + "completions/mean_length": 2255.87109375, + "completions/mean_terminated_length": 2228.22314453125, + "completions/min_length": 510.0, + "completions/min_terminated_length": 510.0, + "entropy": 0.20751511305570602, + "epoch": 1.3157051282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03021460585296154, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 727907322.0, + "reward": 0.21369871497154236, + "reward_std": 0.09200935065746307, + "rewards/progression_diversity/mean": -0.00024685371317900717, + "rewards/progression_diversity/std": 0.004451336804777384, + "rewards/symbolic_reward_accuracy/mean": 0.078125, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.5567382574081421, + "rewards/symbolic_reward_partial_score/std": 0.222740039229393, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.040090799331665, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 0.3165525197982788, + "step": 821 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.20631252229213715, + "epoch": 1.3173076923076923, + "grad_norm": 0.021766817197203636, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 822 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.20735712349414825, + "epoch": 1.3189102564102564, + "grad_norm": 0.024145064875483513, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 823 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.20985466986894608, + "epoch": 1.3205128205128205, + "grad_norm": 0.030014174059033394, + "learning_rate": 1e-06, + "loss": -0.0084, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4514.0, + "completions/mean_length": 2213.6015625, + "completions/mean_terminated_length": 2130.08251953125, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.21330777555704117, + "epoch": 1.3221153846153846, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.032049596309661865, + "learning_rate": 1e-06, + "loss": 0.0345, + "num_tokens": 729936686.0, + "reward": 0.18624354898929596, + "reward_std": 0.09241214394569397, + "rewards/progression_diversity/mean": -0.000645598629489541, + "rewards/progression_diversity/std": 0.011169749312102795, + "rewards/symbolic_reward_accuracy/mean": 0.0546875, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.5134114027023315, + "rewards/symbolic_reward_partial_score/std": 0.2185731679201126, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0392979383468628, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.1306374073028564, + "step": 825 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.20698269456624985, + "epoch": 1.3237179487179487, + "grad_norm": 382.0038757324219, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 826 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2093319445848465, + "epoch": 1.3253205128205128, + "grad_norm": 0.02666737698018551, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 827 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2111140564084053, + "epoch": 1.3269230769230769, + "grad_norm": 172.5316925048828, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4570.0, + "completions/mean_length": 2048.4921875, + "completions/mean_terminated_length": 2020.4383544921875, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 0.22205299139022827, + "epoch": 1.328525641025641, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.03405119851231575, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 731851850.0, + "reward": 0.20125804841518402, + "reward_std": 0.0859595537185669, + "rewards/progression_diversity/mean": -0.00017142272554337978, + "rewards/progression_diversity/std": 0.0027693838346749544, + "rewards/symbolic_reward_accuracy/mean": 0.056640625, + "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, + "rewards/symbolic_reward_partial_score/mean": 0.5582357048988342, + "rewards/symbolic_reward_partial_score/std": 0.21341325342655182, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0437791347503662, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.45466187596321106, + "step": 829 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.22545113414525986, + "epoch": 1.330128205128205, + "grad_norm": 0.03629428148269653, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 830 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.22470547258853912, + "epoch": 1.3317307692307692, + "grad_norm": 0.02873636782169342, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 831 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2245195508003235, + "epoch": 1.3333333333333333, + "grad_norm": 0.027126438915729523, + "learning_rate": 1e-06, + "loss": 0.0426, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8032.0, + "completions/mean_length": 1912.681640625, + "completions/mean_terminated_length": 1884.362060546875, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.2240256890654564, + "epoch": 1.3349358974358974, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.03682028129696846, + "learning_rate": 1e-06, + "loss": 0.0219, + "num_tokens": 733749047.0, + "reward": 0.20436623692512512, + "reward_std": 0.07025286555290222, + "rewards/progression_diversity/mean": -0.0003872342931572348, + "rewards/progression_diversity/std": 0.005586910527199507, + "rewards/symbolic_reward_accuracy/mean": 0.048828125, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.584228515625, + "rewards/symbolic_reward_partial_score/std": 0.201664999127388, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0452286005020142, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.28562289476394653, + "step": 833 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.22573984414339066, + "epoch": 1.3365384615384617, + "grad_norm": 0.023644646629691124, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 834 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2348889708518982, + "epoch": 1.3381410256410255, + "grad_norm": 0.0235019288957119, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 835 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2224627062678337, + "epoch": 1.3397435897435899, + "grad_norm": 0.029762422665953636, + "learning_rate": 1e-06, + "loss": 0.0313, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4299.0, + "completions/max_terminated_length": 4299.0, + "completions/mean_length": 1805.880859375, + "completions/mean_terminated_length": 1805.880859375, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "entropy": 0.23368220031261444, + "epoch": 1.3413461538461537, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.034925542771816254, + "learning_rate": 1e-06, + "loss": 0.0211, + "num_tokens": 735455482.0, + "reward": 0.2690373361110687, + "reward_std": 0.11118809878826141, + "rewards/progression_diversity/mean": -7.453490979969501e-05, + "rewards/progression_diversity/std": 0.0010274524101987481, + "rewards/symbolic_reward_accuracy/mean": 0.130859375, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.6350748538970947, + "rewards/symbolic_reward_partial_score/std": 0.2342989295721054, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.046823501586914, + "sampling/importance_sampling_ratio/min": 0.0010576416971161962, + "sampling/sampling_logp_difference/max": 6.85171365737915, + "sampling/sampling_logp_difference/mean": 0.0981544554233551, + "step": 837 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.23005392402410507, + "epoch": 1.342948717948718, + "grad_norm": 0.02568940818309784, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 838 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.22952641546726227, + "epoch": 1.344551282051282, + "grad_norm": 0.030425578355789185, + "learning_rate": 1e-06, + "loss": -0.0307, + "step": 839 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.22608061879873276, + "epoch": 1.3461538461538463, + "grad_norm": 0.03146536648273468, + "learning_rate": 1e-06, + "loss": 0.042, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4922.0, + "completions/mean_length": 1662.537109375, + "completions/mean_terminated_length": 1604.8060302734375, + "completions/min_length": 464.0, + "completions/min_terminated_length": 464.0, + "entropy": 0.23019438982009888, + "epoch": 1.3477564102564101, + "frac_reward_zero_std": 0.0, + "grad_norm": 86.43042755126953, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 737145805.0, + "reward": 0.24045243859291077, + "reward_std": 0.12287962436676025, + "rewards/progression_diversity/mean": -0.0006555759464390576, + "rewards/progression_diversity/std": 0.009774475358426571, + "rewards/symbolic_reward_accuracy/mean": 0.099609375, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.6036132574081421, + "rewards/symbolic_reward_partial_score/std": 0.22525496780872345, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.043199896812439, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 2.1184022426605225, + "step": 841 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2274625524878502, + "epoch": 1.3493589743589745, + "grad_norm": 0.033875465393066406, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 842 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2375042364001274, + "epoch": 1.3509615384615383, + "grad_norm": 0.02767942100763321, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 843 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2300783023238182, + "epoch": 1.3525641025641026, + "grad_norm": 0.01807348243892193, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4712.0, + "completions/mean_length": 1617.0, + "completions/mean_terminated_length": 1559.09033203125, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.22667521983385086, + "epoch": 1.3541666666666667, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.036902934312820435, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 738841725.0, + "reward": 0.2651147246360779, + "reward_std": 0.10891143232584, + "rewards/progression_diversity/mean": -0.00024866661988198757, + "rewards/progression_diversity/std": 0.002786832395941019, + "rewards/symbolic_reward_accuracy/mean": 0.126953125, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.6311197280883789, + "rewards/symbolic_reward_partial_score/std": 0.21445629000663757, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0440601110458374, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.6725925803184509, + "step": 845 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.22537444531917572, + "epoch": 1.3557692307692308, + "grad_norm": 0.02038760855793953, + "learning_rate": 1e-06, + "loss": 0.0638, + "step": 846 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2280193567276001, + "epoch": 1.357371794871795, + "grad_norm": 0.024952072650194168, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 847 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.23109513521194458, + "epoch": 1.358974358974359, + "grad_norm": 0.020898301154375076, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4054.0, + "completions/max_terminated_length": 4054.0, + "completions/mean_length": 1489.037109375, + "completions/mean_terminated_length": 1489.037109375, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.2277429699897766, + "epoch": 1.3605769230769231, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.028738927096128464, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 740499376.0, + "reward": 0.2145245224237442, + "reward_std": 0.07948393374681473, + "rewards/progression_diversity/mean": -0.0001856325543485582, + "rewards/progression_diversity/std": 0.002533870516344905, + "rewards/symbolic_reward_accuracy/mean": 0.060546875, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.593994140625, + "rewards/symbolic_reward_partial_score/std": 0.20968253910541534, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045418381690979, + "sampling/importance_sampling_ratio/min": 6.543979334111549e-12, + "sampling/sampling_logp_difference/max": 25.75247573852539, + "sampling/sampling_logp_difference/mean": 0.09526436775922775, + "step": 849 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.22307077050209045, + "epoch": 1.3621794871794872, + "grad_norm": 0.025526469573378563, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 850 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.22553226351737976, + "epoch": 1.3637820512820513, + "grad_norm": 0.02305307425558567, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 851 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.21545638144016266, + "epoch": 1.3653846153846154, + "grad_norm": 0.02928796596825123, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3814.0, + "completions/mean_length": 1567.021484375, + "completions/mean_terminated_length": 1538.025390625, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.21113787591457367, + "epoch": 1.3669871794871795, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.04044374078512192, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 742260011.0, + "reward": 0.17563673853874207, + "reward_std": 0.04823887348175049, + "rewards/progression_diversity/mean": -0.00029063018155284226, + "rewards/progression_diversity/std": 0.00633732695132494, + "rewards/symbolic_reward_accuracy/mean": 0.009765625, + "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, + "rewards/symbolic_reward_partial_score/mean": 0.5659342408180237, + "rewards/symbolic_reward_partial_score/std": 0.18250161409378052, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0408705472946167, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.9602516889572144, + "step": 853 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.218083955347538, + "epoch": 1.3685897435897436, + "grad_norm": 0.021193062886595726, + "learning_rate": 1e-06, + "loss": -0.0058, + "step": 854 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.20679837465286255, + "epoch": 1.3701923076923077, + "grad_norm": 1588.3087158203125, + "learning_rate": 1e-06, + "loss": 0.0694, + "step": 855 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.20584507286548615, + "epoch": 1.3717948717948718, + "grad_norm": 0.02309674583375454, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4411.0, + "completions/max_terminated_length": 4411.0, + "completions/mean_length": 1414.36328125, + "completions/mean_terminated_length": 1414.36328125, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.21835298836231232, + "epoch": 1.373397435897436, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.026670735329389572, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 743893157.0, + "reward": 0.23589275777339935, + "reward_std": 0.11220350861549377, + "rewards/progression_diversity/mean": -8.082183921942487e-05, + "rewards/progression_diversity/std": 0.0013526254333555698, + "rewards/symbolic_reward_accuracy/mean": 0.1015625, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.5831868648529053, + "rewards/symbolic_reward_partial_score/std": 0.22691193222999573, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0431017875671387, + "sampling/importance_sampling_ratio/min": 2.427903382340446e-05, + "sampling/sampling_logp_difference/max": 10.625897407531738, + "sampling/sampling_logp_difference/mean": 0.0924476757645607, + "step": 857 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.22078397125005722, + "epoch": 1.375, + "grad_norm": 0.017304185777902603, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 858 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.21548201143741608, + "epoch": 1.376602564102564, + "grad_norm": 0.017954610288143158, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 859 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.21115773171186447, + "epoch": 1.3782051282051282, + "grad_norm": 0.02831229567527771, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3687.0, + "completions/mean_length": 1555.4765625, + "completions/mean_terminated_length": 1526.4578857421875, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "entropy": 0.2089003250002861, + "epoch": 1.3798076923076923, + "frac_reward_zero_std": 0.03125, + "grad_norm": 104.46712493896484, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 745612857.0, + "reward": 0.18778178095817566, + "reward_std": 0.06140269339084625, + "rewards/progression_diversity/mean": -0.0006311247125267982, + "rewards/progression_diversity/std": 0.005138562526553869, + "rewards/symbolic_reward_accuracy/mean": 0.015625, + "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, + "rewards/symbolic_reward_partial_score/mean": 0.5953613519668579, + "rewards/symbolic_reward_partial_score/std": 0.1708049327135086, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0414655208587646, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 724.0, + "sampling/sampling_logp_difference/mean": 0.20548060536384583, + "step": 861 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2088572382926941, + "epoch": 1.3814102564102564, + "grad_norm": 0.025014329701662064, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 862 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.21202120184898376, + "epoch": 1.3830128205128205, + "grad_norm": 0.028442082926630974, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 863 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.21458961814641953, + "epoch": 1.3846153846153846, + "grad_norm": 0.02302766777575016, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4167.0, + "completions/mean_length": 1564.603515625, + "completions/mean_terminated_length": 1477.2593994140625, + "completions/min_length": 416.0, + "completions/min_terminated_length": 416.0, + "entropy": 0.2116372287273407, + "epoch": 1.3862179487179487, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.032430339604616165, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 747279262.0, + "reward": 0.2634183168411255, + "reward_std": 0.09794985502958298, + "rewards/progression_diversity/mean": -0.00045274931471794844, + "rewards/progression_diversity/std": 0.008904572576284409, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.6300293207168579, + "rewards/symbolic_reward_partial_score/std": 0.21683233976364136, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0390100479125977, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.8159689903259277, + "step": 865 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.22469650208950043, + "epoch": 1.3878205128205128, + "grad_norm": 0.018467538058757782, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 866 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.22149597853422165, + "epoch": 1.3894230769230769, + "grad_norm": 0.023728886619210243, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 867 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.22125183045864105, + "epoch": 1.391025641025641, + "grad_norm": 0.019297420978546143, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3547.0, + "completions/mean_length": 1396.44921875, + "completions/mean_terminated_length": 1367.119384765625, + "completions/min_length": 462.0, + "completions/min_terminated_length": 462.0, + "entropy": 0.23219949007034302, + "epoch": 1.392628205128205, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.035795051604509354, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 748746020.0, + "reward": 0.22064433991909027, + "reward_std": 0.07329108566045761, + "rewards/progression_diversity/mean": -0.0009962395997717977, + "rewards/progression_diversity/std": 0.018096894025802612, + "rewards/symbolic_reward_accuracy/mean": 0.052734375, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.6306965351104736, + "rewards/symbolic_reward_partial_score/std": 0.2060777246952057, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0450772047042847, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.2045083045959473, + "step": 869 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23559828102588654, + "epoch": 1.3942307692307692, + "grad_norm": 0.01611843891441822, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 870 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.23359952867031097, + "epoch": 1.3958333333333333, + "grad_norm": 0.02076306752860546, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 871 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2402142956852913, + "epoch": 1.3974358974358974, + "grad_norm": 0.017290132120251656, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3703.0, + "completions/mean_length": 1506.962890625, + "completions/mean_terminated_length": 1477.8492431640625, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.22817867249250412, + "epoch": 1.3990384615384617, + "frac_reward_zero_std": 0.125, + "grad_norm": 61.729652404785156, + "learning_rate": 1e-06, + "loss": 0.028, + "num_tokens": 750412865.0, + "reward": 0.23350869119167328, + "reward_std": 0.08862544596195221, + "rewards/progression_diversity/mean": -0.00020658165158238262, + "rewards/progression_diversity/std": 0.0025547959376126528, + "rewards/symbolic_reward_accuracy/mean": 0.08984375, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.5993326902389526, + "rewards/symbolic_reward_partial_score/std": 0.23785848915576935, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045426368713379, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 712.0, + "sampling/sampling_logp_difference/mean": 0.20375153422355652, + "step": 873 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.22932185232639313, + "epoch": 1.4006410256410255, + "grad_norm": 0.02397080697119236, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 874 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.229917511343956, + "epoch": 1.4022435897435899, + "grad_norm": 0.028196433559060097, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 875 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2285906821489334, + "epoch": 1.4038461538461537, + "grad_norm": 0.021019669249653816, + "learning_rate": 1e-06, + "loss": -0.0085, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4288.0, + "completions/max_terminated_length": 4288.0, + "completions/mean_length": 1584.255859375, + "completions/mean_terminated_length": 1584.255859375, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "entropy": 0.22845402359962463, + "epoch": 1.405448717948718, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.1292523443698883, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 752032676.0, + "reward": 0.21584956347942352, + "reward_std": 0.0788746029138565, + "rewards/progression_diversity/mean": -0.0004929413553327322, + "rewards/progression_diversity/std": 0.0036183472257107496, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.5945149660110474, + "rewards/symbolic_reward_partial_score/std": 0.20550201833248138, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0462896823883057, + "sampling/importance_sampling_ratio/min": 0.0012942898320034146, + "sampling/sampling_logp_difference/max": 6.6497931480407715, + "sampling/sampling_logp_difference/mean": 0.09629081189632416, + "step": 877 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2279265597462654, + "epoch": 1.407051282051282, + "grad_norm": 0.019477983936667442, + "learning_rate": 1e-06, + "loss": -0.0207, + "step": 878 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.22508171200752258, + "epoch": 1.4086538461538463, + "grad_norm": 0.029449041932821274, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 879 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2288508340716362, + "epoch": 1.4102564102564101, + "grad_norm": 0.022348174825310707, + "learning_rate": 1e-06, + "loss": 0.0093, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4524.0, + "completions/mean_length": 1608.572265625, + "completions/mean_terminated_length": 1579.657470703125, + "completions/min_length": 518.0, + "completions/min_terminated_length": 518.0, + "entropy": 0.22652538120746613, + "epoch": 1.4118589743589745, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.0312635712325573, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 753635801.0, + "reward": 0.23253975808620453, + "reward_std": 0.08958201110363007, + "rewards/progression_diversity/mean": -0.0004195784858893603, + "rewards/progression_diversity/std": 0.008441566489636898, + "rewards/symbolic_reward_accuracy/mean": 0.08984375, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.5961099863052368, + "rewards/symbolic_reward_partial_score/std": 0.23455365002155304, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0431697368621826, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.380308985710144, + "step": 881 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.22486486285924911, + "epoch": 1.4134615384615383, + "grad_norm": 0.023399055004119873, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 882 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.22499487549066544, + "epoch": 1.4150641025641026, + "grad_norm": 386.1463317871094, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 883 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.23033517599105835, + "epoch": 1.4166666666666667, + "grad_norm": 0.02022228017449379, + "learning_rate": 1e-06, + "loss": 0.0148, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4005.0, + "completions/mean_length": 1440.396484375, + "completions/mean_terminated_length": 1411.152587890625, + "completions/min_length": 492.0, + "completions/min_terminated_length": 492.0, + "entropy": 0.2310120165348053, + "epoch": 1.4182692307692308, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.029161928221583366, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 755244068.0, + "reward": 0.27608346939086914, + "reward_std": 0.10152895748615265, + "rewards/progression_diversity/mean": -0.0005383668467402458, + "rewards/progression_diversity/std": 0.006143567617982626, + "rewards/symbolic_reward_accuracy/mean": 0.1484375, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.624072253704071, + "rewards/symbolic_reward_partial_score/std": 0.22695200145244598, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0465407371520996, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 700.0, + "sampling/sampling_logp_difference/mean": 0.22680458426475525, + "step": 885 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.23101823031902313, + "epoch": 1.419871794871795, + "grad_norm": 0.01696612685918808, + "learning_rate": 1e-06, + "loss": 0.008, + "step": 886 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.22741740196943283, + "epoch": 1.421474358974359, + "grad_norm": 0.03519902750849724, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 887 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.22920458763837814, + "epoch": 1.4230769230769231, + "grad_norm": 0.01825595274567604, + "learning_rate": 1e-06, + "loss": -0.0034, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3919.0, + "completions/max_terminated_length": 3919.0, + "completions/mean_length": 1301.556640625, + "completions/mean_terminated_length": 1301.556640625, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "entropy": 0.22784828394651413, + "epoch": 1.4246794871794872, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.03417736664414406, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 756822881.0, + "reward": 0.27989476919174194, + "reward_std": 0.10528245568275452, + "rewards/progression_diversity/mean": -0.0002708068350329995, + "rewards/progression_diversity/std": 0.0028580420184880495, + "rewards/symbolic_reward_accuracy/mean": 0.13671875, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.6595540046691895, + "rewards/symbolic_reward_partial_score/std": 0.209646537899971, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0472159385681152, + "sampling/importance_sampling_ratio/min": 9.738879924725552e-09, + "sampling/sampling_logp_difference/max": 18.447139739990234, + "sampling/sampling_logp_difference/mean": 0.09813030064105988, + "step": 889 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23206569254398346, + "epoch": 1.4262820512820513, + "grad_norm": 0.02267414890229702, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 890 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.23299744725227356, + "epoch": 1.4278846153846154, + "grad_norm": 0.02226838655769825, + "learning_rate": 1e-06, + "loss": -0.0129, + "step": 891 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23174694180488586, + "epoch": 1.4294871794871795, + "grad_norm": 0.02325165644288063, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3854.0, + "completions/max_terminated_length": 3854.0, + "completions/mean_length": 1381.82421875, + "completions/mean_terminated_length": 1381.82421875, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.23948068916797638, + "epoch": 1.4310897435897436, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.02791544795036316, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 758422935.0, + "reward": 0.23517410457134247, + "reward_std": 0.05936092138290405, + "rewards/progression_diversity/mean": -0.00016881769988685846, + "rewards/progression_diversity/std": 0.0023643465247005224, + "rewards/symbolic_reward_accuracy/mean": 0.08984375, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.6042318344116211, + "rewards/symbolic_reward_partial_score/std": 0.19526702165603638, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0488243103027344, + "sampling/importance_sampling_ratio/min": 0.00036482111318036914, + "sampling/sampling_logp_difference/max": 7.916103363037109, + "sampling/sampling_logp_difference/mean": 0.10077446699142456, + "step": 893 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2307603806257248, + "epoch": 1.4326923076923077, + "grad_norm": 0.029786186292767525, + "learning_rate": 1e-06, + "loss": 0.0158, + "step": 894 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2346210777759552, + "epoch": 1.4342948717948718, + "grad_norm": 0.030796803534030914, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 895 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24034950882196426, + "epoch": 1.435897435897436, + "grad_norm": 0.01626007631421089, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4579.0, + "completions/max_terminated_length": 4579.0, + "completions/mean_length": 1346.220703125, + "completions/mean_terminated_length": 1346.220703125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.2430427074432373, + "epoch": 1.4375, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.030239736661314964, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 759944376.0, + "reward": 0.2594153881072998, + "reward_std": 0.09177584946155548, + "rewards/progression_diversity/mean": -0.0003554845170583576, + "rewards/progression_diversity/std": 0.004576821345835924, + "rewards/symbolic_reward_accuracy/mean": 0.119140625, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.6264485120773315, + "rewards/symbolic_reward_partial_score/std": 0.23991042375564575, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0497946739196777, + "sampling/importance_sampling_ratio/min": 0.003207254223525524, + "sampling/sampling_logp_difference/max": 5.742340087890625, + "sampling/sampling_logp_difference/mean": 0.1022864505648613, + "step": 897 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.24024366587400436, + "epoch": 1.439102564102564, + "grad_norm": 0.02961251139640808, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 898 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.24130822718143463, + "epoch": 1.4407051282051282, + "grad_norm": 0.02147165685892105, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 899 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.3203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2432633340358734, + "epoch": 1.4423076923076923, + "grad_norm": 0.027493350207805634, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4780.0, + "completions/max_terminated_length": 4780.0, + "completions/mean_length": 1268.705078125, + "completions/mean_terminated_length": 1268.705078125, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 0.24526084959506989, + "epoch": 1.4439102564102564, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.02393464371562004, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 761386065.0, + "reward": 0.23188182711601257, + "reward_std": 0.0820198506116867, + "rewards/progression_diversity/mean": -0.0002951324568130076, + "rewards/progression_diversity/std": 0.0035472013987600803, + "rewards/symbolic_reward_accuracy/mean": 0.05078125, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.67138671875, + "rewards/symbolic_reward_partial_score/std": 0.20371752977371216, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0510923862457275, + "sampling/importance_sampling_ratio/min": 4.374550189822912e-05, + "sampling/sampling_logp_difference/max": 10.037121772766113, + "sampling/sampling_logp_difference/mean": 0.103520967066288, + "step": 901 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.24283086508512497, + "epoch": 1.4455128205128205, + "grad_norm": 0.02904311753809452, + "learning_rate": 1e-06, + "loss": 0.0, + "step": 902 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24352674186229706, + "epoch": 1.4471153846153846, + "grad_norm": 0.01613425463438034, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 903 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.25162430852651596, + "epoch": 1.4487179487179487, + "grad_norm": 0.02424633502960205, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4364.0, + "completions/max_terminated_length": 4364.0, + "completions/mean_length": 1284.42578125, + "completions/mean_terminated_length": 1284.42578125, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "entropy": 0.24817583709955215, + "epoch": 1.4503205128205128, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.025694923475384712, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 762867995.0, + "reward": 0.30710139870643616, + "reward_std": 0.05024458467960358, + "rewards/progression_diversity/mean": -0.0003102348418906331, + "rewards/progression_diversity/std": 0.003935862332582474, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.656494140625, + "rewards/symbolic_reward_partial_score/std": 0.2331247478723526, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0509289503097534, + "sampling/importance_sampling_ratio/min": 0.002624225802719593, + "sampling/sampling_logp_difference/max": 5.94296932220459, + "sampling/sampling_logp_difference/mean": 0.10329769551753998, + "step": 905 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.24394653737545013, + "epoch": 1.4519230769230769, + "grad_norm": 0.020293056964874268, + "learning_rate": 1e-06, + "loss": -0.0066, + "step": 906 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.24355324357748032, + "epoch": 1.453525641025641, + "grad_norm": 0.01819128543138504, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 907 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.241835318505764, + "epoch": 1.455128205128205, + "grad_norm": 0.01816965639591217, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3849.0, + "completions/max_terminated_length": 3849.0, + "completions/mean_length": 1251.197265625, + "completions/mean_terminated_length": 1251.197265625, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "entropy": 0.24188891798257828, + "epoch": 1.4567307692307692, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.02820868045091629, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 764313808.0, + "reward": 0.31225496530532837, + "reward_std": 0.08842569589614868, + "rewards/progression_diversity/mean": -9.080560994334519e-05, + "rewards/progression_diversity/std": 0.001025109551846981, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.6658528447151184, + "rewards/symbolic_reward_partial_score/std": 0.24876438081264496, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0511972904205322, + "sampling/importance_sampling_ratio/min": 0.0021847328171133995, + "sampling/sampling_logp_difference/max": 6.1262617111206055, + "sampling/sampling_logp_difference/mean": 0.1035478338599205, + "step": 909 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.24276289343833923, + "epoch": 1.4583333333333333, + "grad_norm": 0.023914597928524017, + "learning_rate": 1e-06, + "loss": -0.0036, + "step": 910 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2428332194685936, + "epoch": 1.4599358974358974, + "grad_norm": 0.018325267359614372, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 911 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.24748840928077698, + "epoch": 1.4615384615384617, + "grad_norm": 0.021704666316509247, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3406.0, + "completions/max_terminated_length": 3406.0, + "completions/mean_length": 1280.796875, + "completions/mean_terminated_length": 1280.796875, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.24230319261550903, + "epoch": 1.4631410256410255, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.026809457689523697, + "learning_rate": 1e-06, + "loss": -0.0139, + "num_tokens": 765825992.0, + "reward": 0.29785943031311035, + "reward_std": 0.1095178872346878, + "rewards/progression_diversity/mean": -0.00018848836771212518, + "rewards/progression_diversity/std": 0.0023818761110305786, + "rewards/symbolic_reward_accuracy/mean": 0.169921875, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.653027355670929, + "rewards/symbolic_reward_partial_score/std": 0.24102628231048584, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0500684976577759, + "sampling/importance_sampling_ratio/min": 0.00195193977560848, + "sampling/sampling_logp_difference/max": 6.238931655883789, + "sampling/sampling_logp_difference/mean": 0.10224372148513794, + "step": 913 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.24049583077430725, + "epoch": 1.4647435897435899, + "grad_norm": 0.019987255334854126, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 914 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.23697714507579803, + "epoch": 1.4663461538461537, + "grad_norm": 0.014981388114392757, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 915 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.23524734377861023, + "epoch": 1.467948717948718, + "grad_norm": 0.019518760964274406, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3382.0, + "completions/max_terminated_length": 3382.0, + "completions/mean_length": 1324.177734375, + "completions/mean_terminated_length": 1324.177734375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "entropy": 0.23860549181699753, + "epoch": 1.469551282051282, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.034368306398391724, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 767335715.0, + "reward": 0.2800864577293396, + "reward_std": 0.11742740869522095, + "rewards/progression_diversity/mean": -0.0001446699898224324, + "rewards/progression_diversity/std": 0.0014527636812999845, + "rewards/symbolic_reward_accuracy/mean": 0.146484375, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.6406575441360474, + "rewards/symbolic_reward_partial_score/std": 0.218194380402565, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0496015548706055, + "sampling/importance_sampling_ratio/min": 2.004770264726896e-12, + "sampling/sampling_logp_difference/max": 26.93549156188965, + "sampling/sampling_logp_difference/mean": 0.10103441774845123, + "step": 917 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2368185818195343, + "epoch": 1.4711538461538463, + "grad_norm": 0.018526000902056694, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 918 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.24254272133111954, + "epoch": 1.4727564102564101, + "grad_norm": 0.021521111950278282, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 919 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24013221263885498, + "epoch": 1.4743589743589745, + "grad_norm": 0.019323555752635002, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3175.0, + "completions/max_terminated_length": 3175.0, + "completions/mean_length": 1178.45703125, + "completions/mean_terminated_length": 1178.45703125, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.24065189808607101, + "epoch": 1.4759615384615383, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.027268243953585625, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 768849229.0, + "reward": 0.2435724139213562, + "reward_std": 0.0985773503780365, + "rewards/progression_diversity/mean": -0.00018066662596538663, + "rewards/progression_diversity/std": 0.0020558543037623167, + "rewards/symbolic_reward_accuracy/mean": 0.103515625, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.6048828363418579, + "rewards/symbolic_reward_partial_score/std": 0.21721047163009644, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0502285957336426, + "sampling/importance_sampling_ratio/min": 0.0018710603471845388, + "sampling/sampling_logp_difference/max": 6.28125, + "sampling/sampling_logp_difference/mean": 0.1014840379357338, + "step": 921 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2395830601453781, + "epoch": 1.4775641025641026, + "grad_norm": 0.017518319189548492, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 922 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.23873654007911682, + "epoch": 1.4791666666666667, + "grad_norm": 0.015559080988168716, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 923 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2355535551905632, + "epoch": 1.4807692307692308, + "grad_norm": 0.019503874704241753, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3569.0, + "completions/max_terminated_length": 3569.0, + "completions/mean_length": 1200.4765625, + "completions/mean_terminated_length": 1200.4765625, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.2354103922843933, + "epoch": 1.482371794871795, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.025359977036714554, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 770377745.0, + "reward": 0.29045236110687256, + "reward_std": 0.059886179864406586, + "rewards/progression_diversity/mean": -0.00017451572057325393, + "rewards/progression_diversity/std": 0.002117466414347291, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.6556802988052368, + "rewards/symbolic_reward_partial_score/std": 0.22488947212696075, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0493230819702148, + "sampling/importance_sampling_ratio/min": 0.0001748588983900845, + "sampling/sampling_logp_difference/max": 8.651531219482422, + "sampling/sampling_logp_difference/mean": 0.10150042176246643, + "step": 925 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23772598803043365, + "epoch": 1.483974358974359, + "grad_norm": 0.019928786903619766, + "learning_rate": 1e-06, + "loss": 0.0116, + "step": 926 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23272281885147095, + "epoch": 1.4855769230769231, + "grad_norm": 0.020350804552435875, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 927 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2364245057106018, + "epoch": 1.4871794871794872, + "grad_norm": 0.023986948654055595, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4100.0, + "completions/mean_length": 1294.845703125, + "completions/mean_terminated_length": 1235.672607421875, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 0.23016077280044556, + "epoch": 1.4887820512820513, + "frac_reward_zero_std": 0.09375, + "grad_norm": 171.7614288330078, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 771956514.0, + "reward": 0.22332197427749634, + "reward_std": 0.07956110686063766, + "rewards/progression_diversity/mean": -0.0012982608750462532, + "rewards/progression_diversity/std": 0.026022804901003838, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.6285644769668579, + "rewards/symbolic_reward_partial_score/std": 0.20716184377670288, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0451966524124146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 1.1087749004364014, + "step": 929 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2362649217247963, + "epoch": 1.4903846153846154, + "grad_norm": 0.02201710268855095, + "learning_rate": 1e-06, + "loss": -0.0087, + "step": 930 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.22993260622024536, + "epoch": 1.4919871794871795, + "grad_norm": 0.020292161032557487, + "learning_rate": 1e-06, + "loss": 0.0475, + "step": 931 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.230661079287529, + "epoch": 1.4935897435897436, + "grad_norm": 0.021966397762298584, + "learning_rate": 1e-06, + "loss": -0.0086, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3575.0, + "completions/max_terminated_length": 3575.0, + "completions/mean_length": 1369.162109375, + "completions/mean_terminated_length": 1369.162109375, + "completions/min_length": 501.0, + "completions/min_terminated_length": 501.0, + "entropy": 0.23234926909208298, + "epoch": 1.4951923076923077, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.03726167976856232, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 773450389.0, + "reward": 0.29813235998153687, + "reward_std": 0.12365701794624329, + "rewards/progression_diversity/mean": -0.0002430165041005239, + "rewards/progression_diversity/std": 0.00278676301240921, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.6461262702941895, + "rewards/symbolic_reward_partial_score/std": 0.24039986729621887, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0482527017593384, + "sampling/importance_sampling_ratio/min": 5.24304415801171e-15, + "sampling/sampling_logp_difference/max": 32.881874084472656, + "sampling/sampling_logp_difference/mean": 0.10041865706443787, + "step": 933 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.23114058375358582, + "epoch": 1.4967948717948718, + "grad_norm": 0.02173478528857231, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 934 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2359921634197235, + "epoch": 1.498397435897436, + "grad_norm": 0.01734345592558384, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 935 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.296875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2320852428674698, + "epoch": 1.5, + "grad_norm": 0.025080712512135506, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3237.0, + "completions/max_terminated_length": 3237.0, + "completions/mean_length": 1222.326171875, + "completions/mean_terminated_length": 1222.326171875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.23251069337129593, + "epoch": 1.501602564102564, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.03220761939883232, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 774980300.0, + "reward": 0.26821303367614746, + "reward_std": 0.10178233683109283, + "rewards/progression_diversity/mean": -0.00047397619346156716, + "rewards/progression_diversity/std": 0.004046800546348095, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.6284343004226685, + "rewards/symbolic_reward_partial_score/std": 0.20864459872245789, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0485594272613525, + "sampling/importance_sampling_ratio/min": 0.0010224799625575542, + "sampling/sampling_logp_difference/max": 6.885524272918701, + "sampling/sampling_logp_difference/mean": 0.10064580291509628, + "step": 937 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.23433350771665573, + "epoch": 1.5032051282051282, + "grad_norm": 0.027498142793774605, + "learning_rate": 1e-06, + "loss": -0.0059, + "step": 938 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23485945910215378, + "epoch": 1.5048076923076923, + "grad_norm": 0.022965963929891586, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 939 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2323073446750641, + "epoch": 1.5064102564102564, + "grad_norm": 0.017642127349972725, + "learning_rate": 1e-06, + "loss": 0.011, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3077.0, + "completions/max_terminated_length": 3077.0, + "completions/mean_length": 1306.53515625, + "completions/mean_terminated_length": 1306.53515625, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "entropy": 0.23624175786972046, + "epoch": 1.5080128205128205, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.032414428889751434, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 776444462.0, + "reward": 0.35686036944389343, + "reward_std": 0.1280536651611328, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.240234375, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.7090657949447632, + "rewards/symbolic_reward_partial_score/std": 0.2242516130208969, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0489134788513184, + "sampling/importance_sampling_ratio/min": 0.0029864604584872723, + "sampling/sampling_logp_difference/max": 5.813666343688965, + "sampling/sampling_logp_difference/mean": 0.10092984884977341, + "step": 941 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2372562438249588, + "epoch": 1.5096153846153846, + "grad_norm": 0.0257510244846344, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 942 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.23343972861766815, + "epoch": 1.5112179487179487, + "grad_norm": 0.018904410302639008, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 943 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.23894614726305008, + "epoch": 1.5128205128205128, + "grad_norm": 0.023683864623308182, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3415.0, + "completions/mean_length": 1307.0390625, + "completions/mean_terminated_length": 1277.5341796875, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "entropy": 0.23835492879152298, + "epoch": 1.5144230769230769, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.02669195644557476, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 777998866.0, + "reward": 0.27208733558654785, + "reward_std": 0.10921524465084076, + "rewards/progression_diversity/mean": -0.0007387942750938237, + "rewards/progression_diversity/std": 0.014222451485693455, + "rewards/symbolic_reward_accuracy/mean": 0.13671875, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.6341959834098816, + "rewards/symbolic_reward_partial_score/std": 0.22311989963054657, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0479480028152466, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.576566219329834, + "step": 945 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23516834527254105, + "epoch": 1.516025641025641, + "grad_norm": 0.02226245403289795, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 946 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.23766741156578064, + "epoch": 1.5176282051282053, + "grad_norm": 0.018796978518366814, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 947 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.24504876136779785, + "epoch": 1.5192307692307692, + "grad_norm": 0.019867926836013794, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3256.0, + "completions/max_terminated_length": 3256.0, + "completions/mean_length": 1339.611328125, + "completions/mean_terminated_length": 1339.611328125, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.24304132908582687, + "epoch": 1.5208333333333335, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.0299557913094759, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 779544475.0, + "reward": 0.23521313071250916, + "reward_std": 0.07730136811733246, + "rewards/progression_diversity/mean": -0.00017136070528067648, + "rewards/progression_diversity/std": 0.0020358404144644737, + "rewards/symbolic_reward_accuracy/mean": 0.07421875, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.6356120109558105, + "rewards/symbolic_reward_partial_score/std": 0.2210559844970703, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0506515502929688, + "sampling/importance_sampling_ratio/min": 7.503909728256986e-05, + "sampling/sampling_logp_difference/max": 9.497501373291016, + "sampling/sampling_logp_difference/mean": 0.10373318195343018, + "step": 949 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.24009553343057632, + "epoch": 1.5224358974358974, + "grad_norm": 0.01935042440891266, + "learning_rate": 1e-06, + "loss": -0.0066, + "step": 950 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2450866997241974, + "epoch": 1.5240384615384617, + "grad_norm": 0.02386186644434929, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 951 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.24486373364925385, + "epoch": 1.5256410256410255, + "grad_norm": 0.026845736429095268, + "learning_rate": 1e-06, + "loss": -0.0011, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3384.0, + "completions/mean_length": 1382.8046875, + "completions/mean_terminated_length": 1353.4481201171875, + "completions/min_length": 478.0, + "completions/min_terminated_length": 478.0, + "entropy": 0.24295812845230103, + "epoch": 1.5272435897435899, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.022905530408024788, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 781073095.0, + "reward": 0.2498174011707306, + "reward_std": 0.08474698662757874, + "rewards/progression_diversity/mean": -0.00019435372087173164, + "rewards/progression_diversity/std": 0.0025657941587269306, + "rewards/symbolic_reward_accuracy/mean": 0.10546875, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.6224446296691895, + "rewards/symbolic_reward_partial_score/std": 0.231918066740036, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049942970275879, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.46404212713241577, + "step": 953 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.24674228578805923, + "epoch": 1.5288461538461537, + "grad_norm": 0.01813218928873539, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 954 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2422228530049324, + "epoch": 1.530448717948718, + "grad_norm": 1672.424560546875, + "learning_rate": 1e-06, + "loss": 0.0588, + "step": 955 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2425452098250389, + "epoch": 1.532051282051282, + "grad_norm": 0.017666012048721313, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3996.0, + "completions/max_terminated_length": 3996.0, + "completions/mean_length": 1283.716796875, + "completions/mean_terminated_length": 1283.716796875, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.2416147142648697, + "epoch": 1.5336538461538463, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.022795159369707108, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 782631542.0, + "reward": 0.29501813650131226, + "reward_std": 0.07359684258699417, + "rewards/progression_diversity/mean": -0.00013884622603654861, + "rewards/progression_diversity/std": 0.0017234663246199489, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.6748046875, + "rewards/symbolic_reward_partial_score/std": 0.21226514875888824, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0523028373718262, + "sampling/importance_sampling_ratio/min": 1.8659156353351136e-07, + "sampling/sampling_logp_difference/max": 15.494343757629395, + "sampling/sampling_logp_difference/mean": 0.10692096501588821, + "step": 957 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.24522508680820465, + "epoch": 1.5352564102564101, + "grad_norm": 0.01961701549589634, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 958 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.25065000355243683, + "epoch": 1.5368589743589745, + "grad_norm": 0.020117729902267456, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 959 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.25352972745895386, + "epoch": 1.5384615384615383, + "grad_norm": 0.019687386229634285, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3396.0, + "completions/mean_length": 1496.544921875, + "completions/mean_terminated_length": 1467.410888671875, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.23719095438718796, + "epoch": 1.5400641025641026, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.03074975125491619, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 784215197.0, + "reward": 0.3043617606163025, + "reward_std": 0.09115153551101685, + "rewards/progression_diversity/mean": -0.0003482637694105506, + "rewards/progression_diversity/std": 0.006979175843298435, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.6395508050918579, + "rewards/symbolic_reward_partial_score/std": 0.25730493664741516, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0488170385360718, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.163615345954895, + "step": 961 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2416258454322815, + "epoch": 1.5416666666666665, + "grad_norm": 0.02943342924118042, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 962 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.24121008068323135, + "epoch": 1.5432692307692308, + "grad_norm": 0.022504808381199837, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 963 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.23982763290405273, + "epoch": 1.5448717948717947, + "grad_norm": 0.026652559638023376, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3516.0, + "completions/max_terminated_length": 3516.0, + "completions/mean_length": 1295.953125, + "completions/mean_terminated_length": 1295.953125, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "entropy": 0.2383585274219513, + "epoch": 1.546474358974359, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.03317641466856003, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 785827077.0, + "reward": 0.2608430087566376, + "reward_std": 0.09105145186185837, + "rewards/progression_diversity/mean": -0.00017204870528075844, + "rewards/progression_diversity/std": 0.002580570289865136, + "rewards/symbolic_reward_accuracy/mean": 0.091796875, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.6858886480331421, + "rewards/symbolic_reward_partial_score/std": 0.18577834963798523, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0511445999145508, + "sampling/importance_sampling_ratio/min": 2.0922935606293436e-10, + "sampling/sampling_logp_difference/max": 22.28759002685547, + "sampling/sampling_logp_difference/mean": 0.10452830791473389, + "step": 965 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2417585551738739, + "epoch": 1.5480769230769231, + "grad_norm": 0.015128690749406815, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 966 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24052689224481583, + "epoch": 1.5496794871794872, + "grad_norm": 0.016625335440039635, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 967 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23810868710279465, + "epoch": 1.5512820512820513, + "grad_norm": 0.024602653458714485, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3115.0, + "completions/max_terminated_length": 3115.0, + "completions/mean_length": 1370.482421875, + "completions/mean_terminated_length": 1370.482421875, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.24552591890096664, + "epoch": 1.5528846153846154, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.030604401603341103, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 787405868.0, + "reward": 0.22608765959739685, + "reward_std": 0.0656827911734581, + "rewards/progression_diversity/mean": -0.00012242203229106963, + "rewards/progression_diversity/std": 0.0018643095390871167, + "rewards/symbolic_reward_accuracy/mean": 0.060546875, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.6325358152389526, + "rewards/symbolic_reward_partial_score/std": 0.17896120250225067, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.051476240158081, + "sampling/importance_sampling_ratio/min": 0.003860935801640153, + "sampling/sampling_logp_difference/max": 5.556845664978027, + "sampling/sampling_logp_difference/mean": 0.10471828281879425, + "step": 969 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.24233480542898178, + "epoch": 1.5544871794871795, + "grad_norm": 0.017392568290233612, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 970 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2429162785410881, + "epoch": 1.5560897435897436, + "grad_norm": 0.02391919493675232, + "learning_rate": 1e-06, + "loss": -0.0042, + "step": 971 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.24405669420957565, + "epoch": 1.5576923076923077, + "grad_norm": 0.020459329709410667, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3250.0, + "completions/max_terminated_length": 3250.0, + "completions/mean_length": 1407.92578125, + "completions/mean_terminated_length": 1407.92578125, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.2465820536017418, + "epoch": 1.5592948717948718, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.02617330104112625, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 788955078.0, + "reward": 0.2446797788143158, + "reward_std": 0.07382390648126602, + "rewards/progression_diversity/mean": -0.00028444104827940464, + "rewards/progression_diversity/std": 0.002962089842185378, + "rewards/symbolic_reward_accuracy/mean": 0.078125, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.6593587398529053, + "rewards/symbolic_reward_partial_score/std": 0.18929307162761688, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0518171787261963, + "sampling/importance_sampling_ratio/min": 7.697512046433985e-06, + "sampling/sampling_logp_difference/max": 11.774613380432129, + "sampling/sampling_logp_difference/mean": 0.10526497662067413, + "step": 973 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.24588342010974884, + "epoch": 1.560897435897436, + "grad_norm": 0.022333713248372078, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 974 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.24611911922693253, + "epoch": 1.5625, + "grad_norm": 0.01700982078909874, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 975 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2488018348813057, + "epoch": 1.564102564102564, + "grad_norm": 0.01537299808114767, + "learning_rate": 1e-06, + "loss": -0.0055, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3609.0, + "completions/max_terminated_length": 3609.0, + "completions/mean_length": 1228.7265625, + "completions/mean_terminated_length": 1228.7265625, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.2481737807393074, + "epoch": 1.5657051282051282, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.03226266801357269, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 790546554.0, + "reward": 0.19851306080818176, + "reward_std": 0.0660344660282135, + "rewards/progression_diversity/mean": -0.0002565660688560456, + "rewards/progression_diversity/std": 0.0029936530627310276, + "rewards/symbolic_reward_accuracy/mean": 0.044921875, + "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, + "rewards/symbolic_reward_partial_score/mean": 0.5718749761581421, + "rewards/symbolic_reward_partial_score/std": 0.20530447363853455, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0529671907424927, + "sampling/importance_sampling_ratio/min": 0.0003155683516524732, + "sampling/sampling_logp_difference/max": 8.061135292053223, + "sampling/sampling_logp_difference/mean": 0.10595399141311646, + "step": 977 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.24593167752027512, + "epoch": 1.5673076923076923, + "grad_norm": 0.017432967200875282, + "learning_rate": 1e-06, + "loss": -0.0049, + "step": 978 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.24341944605112076, + "epoch": 1.5689102564102564, + "grad_norm": 0.01746542379260063, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 979 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.24650365114212036, + "epoch": 1.5705128205128205, + "grad_norm": 0.015472476370632648, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3315.0, + "completions/max_terminated_length": 3315.0, + "completions/mean_length": 1509.9140625, + "completions/mean_terminated_length": 1509.9140625, + "completions/min_length": 491.0, + "completions/min_terminated_length": 491.0, + "entropy": 0.2508416026830673, + "epoch": 1.5721153846153846, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.025885384529829025, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 792077166.0, + "reward": 0.3290412425994873, + "reward_std": 0.11779160797595978, + "rewards/progression_diversity/mean": -0.00017644937906879932, + "rewards/progression_diversity/std": 0.0018998431041836739, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.6788411140441895, + "rewards/symbolic_reward_partial_score/std": 0.22582782804965973, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0522162914276123, + "sampling/importance_sampling_ratio/min": 0.00034631905145943165, + "sampling/sampling_logp_difference/max": 7.9681501388549805, + "sampling/sampling_logp_difference/mean": 0.10585089027881622, + "step": 981 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.24679741263389587, + "epoch": 1.5737179487179487, + "grad_norm": 0.025917481631040573, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 982 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.24447974562644958, + "epoch": 1.5753205128205128, + "grad_norm": 0.020733220502734184, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 983 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2511764466762543, + "epoch": 1.5769230769230769, + "grad_norm": 0.022386424243450165, + "learning_rate": 1e-06, + "loss": 0.0115, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3325.0, + "completions/max_terminated_length": 3325.0, + "completions/mean_length": 1401.8359375, + "completions/mean_terminated_length": 1401.8359375, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "entropy": 0.25129084289073944, + "epoch": 1.578525641025641, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.03222787380218506, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 793645338.0, + "reward": 0.23190167546272278, + "reward_std": 0.05650953948497772, + "rewards/progression_diversity/mean": -0.0002622118918225169, + "rewards/progression_diversity/std": 0.0023251688107848167, + "rewards/symbolic_reward_accuracy/mean": 0.0703125, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.6323893666267395, + "rewards/symbolic_reward_partial_score/std": 0.19433899223804474, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0537452697753906, + "sampling/importance_sampling_ratio/min": 2.0533148514800814e-08, + "sampling/sampling_logp_difference/max": 17.70122528076172, + "sampling/sampling_logp_difference/mean": 0.10701755434274673, + "step": 985 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2517654523253441, + "epoch": 1.5801282051282053, + "grad_norm": 0.023150594905018806, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 986 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2524363100528717, + "epoch": 1.5817307692307692, + "grad_norm": 0.017600167542696, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 987 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2514464929699898, + "epoch": 1.5833333333333335, + "grad_norm": 0.014767357148230076, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3305.0, + "completions/mean_length": 1364.2421875, + "completions/mean_terminated_length": 1334.849365234375, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.24437803030014038, + "epoch": 1.5849358974358974, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.026894228532910347, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 795228758.0, + "reward": 0.25398755073547363, + "reward_std": 0.0541648268699646, + "rewards/progression_diversity/mean": -0.00017180124996230006, + "rewards/progression_diversity/std": 0.0016380609013140202, + "rewards/symbolic_reward_accuracy/mean": 0.095703125, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.6558756828308105, + "rewards/symbolic_reward_partial_score/std": 0.18879389762878418, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0525712966918945, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.19978845119476318, + "step": 989 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24415334314107895, + "epoch": 1.5865384615384617, + "grad_norm": 0.02484998293220997, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 990 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25222089886665344, + "epoch": 1.5881410256410255, + "grad_norm": 0.01625809818506241, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 991 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2524196058511734, + "epoch": 1.5897435897435899, + "grad_norm": 0.019578304141759872, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3574.0, + "completions/max_terminated_length": 3574.0, + "completions/mean_length": 1432.29296875, + "completions/mean_terminated_length": 1432.29296875, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "entropy": 0.25225862860679626, + "epoch": 1.5913461538461537, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.02162688970565796, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 796763980.0, + "reward": 0.2408090978860855, + "reward_std": 0.06670250743627548, + "rewards/progression_diversity/mean": -0.00014496369112748653, + "rewards/progression_diversity/std": 0.0017526200972497463, + "rewards/symbolic_reward_accuracy/mean": 0.080078125, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.6425455808639526, + "rewards/symbolic_reward_partial_score/std": 0.1895546019077301, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0530476570129395, + "sampling/importance_sampling_ratio/min": 0.003646494820713997, + "sampling/sampling_logp_difference/max": 5.613988876342773, + "sampling/sampling_logp_difference/mean": 0.10609988868236542, + "step": 993 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2456853985786438, + "epoch": 1.592948717948718, + "grad_norm": 0.019648630172014236, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 994 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2553592026233673, + "epoch": 1.594551282051282, + "grad_norm": 0.025570129975676537, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 995 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24990757554769516, + "epoch": 1.5961538461538463, + "grad_norm": 0.04156876355409622, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3606.0, + "completions/max_terminated_length": 3606.0, + "completions/mean_length": 1444.166015625, + "completions/mean_terminated_length": 1444.166015625, + "completions/min_length": 420.0, + "completions/min_terminated_length": 420.0, + "entropy": 0.2480572983622551, + "epoch": 1.5977564102564101, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.029209570959210396, + "learning_rate": 1e-06, + "loss": -0.0069, + "num_tokens": 798317441.0, + "reward": 0.2392357736825943, + "reward_std": 0.06654238700866699, + "rewards/progression_diversity/mean": -0.0002508866600692272, + "rewards/progression_diversity/std": 0.0026269752997905016, + "rewards/symbolic_reward_accuracy/mean": 0.091796875, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.6138671636581421, + "rewards/symbolic_reward_partial_score/std": 0.2098008245229721, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0531139373779297, + "sampling/importance_sampling_ratio/min": 0.003923820797353983, + "sampling/sampling_logp_difference/max": 5.540689468383789, + "sampling/sampling_logp_difference/mean": 0.10546484589576721, + "step": 997 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.24528241902589798, + "epoch": 1.5993589743589745, + "grad_norm": 0.02387697622179985, + "learning_rate": 1e-06, + "loss": 0.0125, + "step": 998 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.24644892662763596, + "epoch": 1.6009615384615383, + "grad_norm": 0.023926518857479095, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 999 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.24555939435958862, + "epoch": 1.6025641025641026, + "grad_norm": 0.02301941066980362, + "learning_rate": 1e-06, + "loss": -0.0124, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3619.0, + "completions/mean_length": 1375.71875, + "completions/mean_terminated_length": 1346.3482666015625, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "entropy": 0.23569654673337936, + "epoch": 1.6041666666666665, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.03722744435071945, + "learning_rate": 1e-06, + "loss": -0.0089, + "num_tokens": 800005953.0, + "reward": 0.23674920201301575, + "reward_std": 0.0885818749666214, + "rewards/progression_diversity/mean": -0.0008616495179012418, + "rewards/progression_diversity/std": 0.013127473182976246, + "rewards/symbolic_reward_accuracy/mean": 0.095703125, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.598437488079071, + "rewards/symbolic_reward_partial_score/std": 0.22310422360897064, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0491344928741455, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 1.0696396827697754, + "step": 1001 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2391332983970642, + "epoch": 1.6057692307692308, + "grad_norm": 0.023793857544660568, + "learning_rate": 1e-06, + "loss": 0.0327, + "step": 1002 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2433808520436287, + "epoch": 1.6073717948717947, + "grad_norm": 0.01809019409120083, + "learning_rate": 1e-06, + "loss": 0.0041, + "step": 1003 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23954389244318008, + "epoch": 1.608974358974359, + "grad_norm": 0.017016278579831123, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3544.0, + "completions/max_terminated_length": 3544.0, + "completions/mean_length": 1332.548828125, + "completions/mean_terminated_length": 1332.548828125, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.24541258811950684, + "epoch": 1.6105769230769231, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.024081578478217125, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 801514282.0, + "reward": 0.2921689450740814, + "reward_std": 0.07876162230968475, + "rewards/progression_diversity/mean": -0.0003911844396498054, + "rewards/progression_diversity/std": 0.0043034846894443035, + "rewards/symbolic_reward_accuracy/mean": 0.140625, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.6926594972610474, + "rewards/symbolic_reward_partial_score/std": 0.20360209047794342, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052016019821167, + "sampling/importance_sampling_ratio/min": 8.38755295262672e-06, + "sampling/sampling_logp_difference/max": 11.688761711120605, + "sampling/sampling_logp_difference/mean": 0.10484174638986588, + "step": 1005 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.23762197047472, + "epoch": 1.6121794871794872, + "grad_norm": 0.017401108518242836, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 1006 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.24435139447450638, + "epoch": 1.6137820512820513, + "grad_norm": 0.02926545962691307, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 1007 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24484486877918243, + "epoch": 1.6153846153846154, + "grad_norm": 0.023529132828116417, + "learning_rate": 1e-06, + "loss": -0.0068, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3515.0, + "completions/mean_length": 1400.970703125, + "completions/mean_terminated_length": 1371.649658203125, + "completions/min_length": 404.0, + "completions/min_terminated_length": 404.0, + "entropy": 0.23839521408081055, + "epoch": 1.6169871794871795, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.036472517997026443, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 803169707.0, + "reward": 0.23677846789360046, + "reward_std": 0.052502475678920746, + "rewards/progression_diversity/mean": -0.0003774126525968313, + "rewards/progression_diversity/std": 0.0034933658316731453, + "rewards/symbolic_reward_accuracy/mean": 0.0703125, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.6493000984191895, + "rewards/symbolic_reward_partial_score/std": 0.23349793255329132, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0512499809265137, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 228.0, + "sampling/sampling_logp_difference/mean": 0.11092744767665863, + "step": 1009 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24148935079574585, + "epoch": 1.6185897435897436, + "grad_norm": 0.029029127210378647, + "learning_rate": 1e-06, + "loss": 0.0061, + "step": 1010 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.24070608615875244, + "epoch": 1.6201923076923077, + "grad_norm": 0.020563537254929543, + "learning_rate": 1e-06, + "loss": 0.009, + "step": 1011 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.24124725908041, + "epoch": 1.6217948717948718, + "grad_norm": 0.016905367374420166, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3733.0, + "completions/mean_length": 1487.85546875, + "completions/mean_terminated_length": 1400.0589599609375, + "completions/min_length": 485.0, + "completions/min_terminated_length": 485.0, + "entropy": 0.2362624704837799, + "epoch": 1.623397435897436, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.025372762233018875, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 804794641.0, + "reward": 0.2821791172027588, + "reward_std": 0.1007528007030487, + "rewards/progression_diversity/mean": -0.0003515832358971238, + "rewards/progression_diversity/std": 0.004287549760192633, + "rewards/symbolic_reward_accuracy/mean": 0.150390625, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.6417806148529053, + "rewards/symbolic_reward_partial_score/std": 0.22223784029483795, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0479587316513062, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.5552178621292114, + "step": 1013 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.237822063267231, + "epoch": 1.625, + "grad_norm": 0.017621813341975212, + "learning_rate": 1e-06, + "loss": -0.0144, + "step": 1014 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.24151387810707092, + "epoch": 1.626602564102564, + "grad_norm": 0.01545657031238079, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 1015 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23863951116800308, + "epoch": 1.6282051282051282, + "grad_norm": 153.45928955078125, + "learning_rate": 1e-06, + "loss": 0.0558, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4848.0, + "completions/mean_length": 1533.744140625, + "completions/mean_terminated_length": 1504.6829833984375, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.23886650800704956, + "epoch": 1.6298076923076923, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.03264259546995163, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 806450974.0, + "reward": 0.1910867989063263, + "reward_std": 0.055806443095207214, + "rewards/progression_diversity/mean": -0.0006959763122722507, + "rewards/progression_diversity/std": 0.00666604982689023, + "rewards/symbolic_reward_accuracy/mean": 0.025390625, + "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, + "rewards/symbolic_reward_partial_score/mean": 0.5868489742279053, + "rewards/symbolic_reward_partial_score/std": 0.21801504492759705, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0492140054702759, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.3248835504055023, + "step": 1017 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23825782537460327, + "epoch": 1.6314102564102564, + "grad_norm": 0.02487725391983986, + "learning_rate": 1e-06, + "loss": 0.0279, + "step": 1018 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23552914708852768, + "epoch": 1.6330128205128205, + "grad_norm": 0.029635349288582802, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 1019 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2386602759361267, + "epoch": 1.6346153846153846, + "grad_norm": 0.02634367346763611, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3918.0, + "completions/mean_length": 1762.474609375, + "completions/mean_terminated_length": 1676.2967529296875, + "completions/min_length": 469.0, + "completions/min_terminated_length": 469.0, + "entropy": 0.23500239849090576, + "epoch": 1.6362179487179487, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.02729848027229309, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 808198689.0, + "reward": 0.29712605476379395, + "reward_std": 0.09707895666360855, + "rewards/progression_diversity/mean": -0.0002862706023734063, + "rewards/progression_diversity/std": 0.004292279481887817, + "rewards/symbolic_reward_accuracy/mean": 0.1484375, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.695507824420929, + "rewards/symbolic_reward_partial_score/std": 0.23922735452651978, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.047842025756836, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 0.4406740665435791, + "step": 1021 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.23327183723449707, + "epoch": 1.6378205128205128, + "grad_norm": 20.64282989501953, + "learning_rate": 1e-06, + "loss": 0.0436, + "step": 1022 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22972074151039124, + "epoch": 1.6394230769230769, + "grad_norm": 0.02955857291817665, + "learning_rate": 1e-06, + "loss": 0.0107, + "step": 1023 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.23090046644210815, + "epoch": 1.641025641025641, + "grad_norm": 0.025243405252695084, + "learning_rate": 1e-06, + "loss": -0.0164, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4233.0, + "completions/mean_length": 1425.66796875, + "completions/mean_terminated_length": 1396.395263671875, + "completions/min_length": 325.0, + "completions/min_terminated_length": 325.0, + "entropy": 0.24533987045288086, + "epoch": 1.6426282051282053, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.031315580010414124, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 809735287.0, + "reward": 0.2972439229488373, + "reward_std": 0.09149158000946045, + "rewards/progression_diversity/mean": -0.00021706035477109253, + "rewards/progression_diversity/std": 0.0027459680568426847, + "rewards/symbolic_reward_accuracy/mean": 0.1640625, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.6633463501930237, + "rewards/symbolic_reward_partial_score/std": 0.2075018286705017, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0503005981445312, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 704.0, + "sampling/sampling_logp_difference/mean": 0.17904753983020782, + "step": 1025 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.23708289116621017, + "epoch": 1.6442307692307692, + "grad_norm": 0.04043427109718323, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 1026 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2370290458202362, + "epoch": 1.6458333333333335, + "grad_norm": 0.019201675429940224, + "learning_rate": 1e-06, + "loss": 0.0346, + "step": 1027 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23796197026968002, + "epoch": 1.6474358974358974, + "grad_norm": 0.021262312307953835, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4151.0, + "completions/max_terminated_length": 4151.0, + "completions/mean_length": 1320.484375, + "completions/mean_terminated_length": 1320.484375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.22943055629730225, + "epoch": 1.6490384615384617, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.03114836849272251, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 811322847.0, + "reward": 0.31148767471313477, + "reward_std": 0.11784271150827408, + "rewards/progression_diversity/mean": -0.00015914140385575593, + "rewards/progression_diversity/std": 0.0020391715224832296, + "rewards/symbolic_reward_accuracy/mean": 0.181640625, + "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, + "rewards/symbolic_reward_partial_score/mean": 0.6750162839889526, + "rewards/symbolic_reward_partial_score/std": 0.22747963666915894, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0495221614837646, + "sampling/importance_sampling_ratio/min": 0.0016271222848445177, + "sampling/sampling_logp_difference/max": 6.420942306518555, + "sampling/sampling_logp_difference/mean": 0.10276812314987183, + "step": 1029 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.23415642231702805, + "epoch": 1.6506410256410255, + "grad_norm": 0.017106305807828903, + "learning_rate": 1e-06, + "loss": -0.007, + "step": 1030 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2420409992337227, + "epoch": 1.6522435897435899, + "grad_norm": 0.021166684105992317, + "learning_rate": 1e-06, + "loss": 0.0165, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.22991270571947098, + "epoch": 1.6538461538461537, + "grad_norm": 0.026932181790471077, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3803.0, + "completions/mean_length": 1430.66015625, + "completions/mean_terminated_length": 1372.0196533203125, + "completions/min_length": 468.0, + "completions/min_terminated_length": 468.0, + "entropy": 0.23568789660930634, + "epoch": 1.655448717948718, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02257698029279709, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 812878337.0, + "reward": 0.372047483921051, + "reward_std": 0.05590501427650452, + "rewards/progression_diversity/mean": -0.0008204940240830183, + "rewards/progression_diversity/std": 0.008694627322256565, + "rewards/symbolic_reward_accuracy/mean": 0.255859375, + "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, + "rewards/symbolic_reward_partial_score/mean": 0.7297688722610474, + "rewards/symbolic_reward_partial_score/std": 0.2233889102935791, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0493979454040527, + "sampling/importance_sampling_ratio/min": 1.542988610145346e-12, + "sampling/sampling_logp_difference/max": 27.19729995727539, + "sampling/sampling_logp_difference/mean": 0.10240708291530609, + "step": 1033 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2355349287390709, + "epoch": 1.657051282051282, + "grad_norm": 0.017844535410404205, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 1034 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.23197440803050995, + "epoch": 1.6586538461538463, + "grad_norm": 0.020765669643878937, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 1035 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.23954713344573975, + "epoch": 1.6602564102564101, + "grad_norm": 0.02009597420692444, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4692.0, + "completions/max_terminated_length": 4692.0, + "completions/mean_length": 1252.927734375, + "completions/mean_terminated_length": 1252.927734375, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "entropy": 0.23540305346250534, + "epoch": 1.6618589743589745, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.022153835743665695, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 814426508.0, + "reward": 0.29029542207717896, + "reward_std": 0.04895063489675522, + "rewards/progression_diversity/mean": -0.00024394365027546883, + "rewards/progression_diversity/std": 0.0027705603279173374, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.7176594734191895, + "rewards/symbolic_reward_partial_score/std": 0.19320081174373627, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0504940748214722, + "sampling/importance_sampling_ratio/min": 0.0024849912151694298, + "sampling/sampling_logp_difference/max": 5.997486114501953, + "sampling/sampling_logp_difference/mean": 0.10428472608327866, + "step": 1037 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.24068902432918549, + "epoch": 1.6634615384615383, + "grad_norm": 0.021785540506243706, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 1038 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23235713690519333, + "epoch": 1.6650641025641026, + "grad_norm": 0.020368505269289017, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 1039 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.236878864467144, + "epoch": 1.6666666666666665, + "grad_norm": 0.0215722918510437, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4650.0, + "completions/mean_length": 1380.171875, + "completions/mean_terminated_length": 1291.74072265625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.23111912608146667, + "epoch": 1.6682692307692308, + "frac_reward_zero_std": 0.1875, + "grad_norm": 2.0536277294158936, + "learning_rate": 1e-06, + "loss": 0.0259, + "num_tokens": 816040052.0, + "reward": 0.26442086696624756, + "reward_std": 0.0863037258386612, + "rewards/progression_diversity/mean": -0.0022489791736006737, + "rewards/progression_diversity/std": 0.027088049799203873, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.6373372077941895, + "rewards/symbolic_reward_partial_score/std": 0.2190377116203308, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.047485589981079, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.2174200415611267, + "step": 1041 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.23076869547367096, + "epoch": 1.6698717948717947, + "grad_norm": 0.016981706023216248, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 1042 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.22752298414707184, + "epoch": 1.671474358974359, + "grad_norm": 0.019416002556681633, + "learning_rate": 1e-06, + "loss": 0.0329, + "step": 1043 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.23331021517515182, + "epoch": 1.6730769230769231, + "grad_norm": 0.015192318707704544, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3838.0, + "completions/max_terminated_length": 3838.0, + "completions/mean_length": 1214.56640625, + "completions/mean_terminated_length": 1214.56640625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.24394644051790237, + "epoch": 1.6746794871794872, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.03186223655939102, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 817475990.0, + "reward": 0.31599071621894836, + "reward_std": 0.08919353783130646, + "rewards/progression_diversity/mean": -4.969753717887215e-05, + "rewards/progression_diversity/std": 0.0011245269561186433, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.6704915165901184, + "rewards/symbolic_reward_partial_score/std": 0.22778773307800293, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0511565208435059, + "sampling/importance_sampling_ratio/min": 0.00036043746513314545, + "sampling/sampling_logp_difference/max": 7.928192138671875, + "sampling/sampling_logp_difference/mean": 0.1060151681303978, + "step": 1045 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.23804038763046265, + "epoch": 1.6762820512820513, + "grad_norm": 0.012359860353171825, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1046 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.24049938470125198, + "epoch": 1.6778846153846154, + "grad_norm": 0.01780606620013714, + "learning_rate": 1e-06, + "loss": 0.0041, + "step": 1047 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2433236464858055, + "epoch": 1.6794871794871795, + "grad_norm": 0.018220573663711548, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3815.0, + "completions/max_terminated_length": 3815.0, + "completions/mean_length": 1282.876953125, + "completions/mean_terminated_length": 1282.876953125, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.24433332681655884, + "epoch": 1.6810897435897436, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.032106902450323105, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 818907495.0, + "reward": 0.29857733845710754, + "reward_std": 0.13391146063804626, + "rewards/progression_diversity/mean": -0.0001772599498508498, + "rewards/progression_diversity/std": 0.0024098155554383993, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.6280761957168579, + "rewards/symbolic_reward_partial_score/std": 0.25627195835113525, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0512542724609375, + "sampling/importance_sampling_ratio/min": 9.631957800593227e-05, + "sampling/sampling_logp_difference/max": 9.247838973999023, + "sampling/sampling_logp_difference/mean": 0.10506170988082886, + "step": 1049 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.24064549058675766, + "epoch": 1.6826923076923077, + "grad_norm": 0.01762561686336994, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1050 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24004538357257843, + "epoch": 1.6842948717948718, + "grad_norm": 0.022497214376926422, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 1051 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2305016741156578, + "epoch": 1.685897435897436, + "grad_norm": 0.02127472683787346, + "learning_rate": 1e-06, + "loss": -0.0066, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3644.0, + "completions/max_terminated_length": 3644.0, + "completions/mean_length": 1173.380859375, + "completions/mean_terminated_length": 1173.380859375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 0.23441405594348907, + "epoch": 1.6875, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.02767036482691765, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 820447722.0, + "reward": 0.2577317953109741, + "reward_std": 0.04780818521976471, + "rewards/progression_diversity/mean": -0.0002602554450277239, + "rewards/progression_diversity/std": 0.0028319996781647205, + "rewards/symbolic_reward_accuracy/mean": 0.11328125, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.6325520873069763, + "rewards/symbolic_reward_partial_score/std": 0.20778940618038177, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0493583679199219, + "sampling/importance_sampling_ratio/min": 0.0004806618671864271, + "sampling/sampling_logp_difference/max": 7.640346527099609, + "sampling/sampling_logp_difference/mean": 0.10264922678470612, + "step": 1053 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.22839296609163284, + "epoch": 1.689102564102564, + "grad_norm": 0.01796301268041134, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1054 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.22674311697483063, + "epoch": 1.6907051282051282, + "grad_norm": 0.01843862235546112, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.22671975195407867, + "epoch": 1.6923076923076923, + "grad_norm": 0.02962394803762436, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3951.0, + "completions/mean_length": 1224.1875, + "completions/mean_terminated_length": 1194.5205078125, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "entropy": 0.22822780907154083, + "epoch": 1.6939102564102564, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.06622759997844696, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 821993514.0, + "reward": 0.26173463463783264, + "reward_std": 0.07937778532505035, + "rewards/progression_diversity/mean": -0.00036702307988889515, + "rewards/progression_diversity/std": 0.0035130290780216455, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.6387370228767395, + "rewards/symbolic_reward_partial_score/std": 0.2266356348991394, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0486118793487549, + "sampling/importance_sampling_ratio/min": 1.0303001507505627e-11, + "sampling/sampling_logp_difference/max": 25.298585891723633, + "sampling/sampling_logp_difference/mean": 0.10055477917194366, + "step": 1057 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23284170031547546, + "epoch": 1.6955128205128205, + "grad_norm": 0.02792915143072605, + "learning_rate": 1e-06, + "loss": -0.0146, + "step": 1058 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.22962763160467148, + "epoch": 1.6971153846153846, + "grad_norm": 0.018218589946627617, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2237945944070816, + "epoch": 1.6987179487179487, + "grad_norm": 0.01743151806294918, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3416.0, + "completions/mean_length": 1289.6484375, + "completions/mean_terminated_length": 1260.109619140625, + "completions/min_length": 376.0, + "completions/min_terminated_length": 376.0, + "entropy": 0.22563714534044266, + "epoch": 1.7003205128205128, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.027078071609139442, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 823552902.0, + "reward": 0.27887141704559326, + "reward_std": 0.07552332431077957, + "rewards/progression_diversity/mean": -6.769368337700143e-05, + "rewards/progression_diversity/std": 0.0008789710118435323, + "rewards/symbolic_reward_accuracy/mean": 0.146484375, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.637255847454071, + "rewards/symbolic_reward_partial_score/std": 0.2156410962343216, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0477631092071533, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.3541214168071747, + "step": 1061 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.22320057451725006, + "epoch": 1.7019230769230769, + "grad_norm": 614.7477416992188, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 1062 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2247406244277954, + "epoch": 1.703525641025641, + "grad_norm": 0.016136281192302704, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 1063 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.22823335230350494, + "epoch": 1.7051282051282053, + "grad_norm": 0.02105732634663582, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4059.0, + "completions/mean_length": 1367.603515625, + "completions/mean_terminated_length": 1338.2171630859375, + "completions/min_length": 352.0, + "completions/min_terminated_length": 352.0, + "entropy": 0.2308506816625595, + "epoch": 1.7067307692307692, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.027784336358308792, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 825156523.0, + "reward": 0.2746616005897522, + "reward_std": 0.09659157693386078, + "rewards/progression_diversity/mean": -0.0001522963575553149, + "rewards/progression_diversity/std": 0.0019277379615232348, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.6466634273529053, + "rewards/symbolic_reward_partial_score/std": 0.22976163029670715, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0482929944992065, + "sampling/importance_sampling_ratio/min": 4.196563168079592e-06, + "sampling/sampling_logp_difference/max": 12.381244659423828, + "sampling/sampling_logp_difference/mean": 0.10019037127494812, + "step": 1065 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.22658158838748932, + "epoch": 1.7083333333333335, + "grad_norm": 0.019185129553079605, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 1066 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2332712933421135, + "epoch": 1.7099358974358974, + "grad_norm": 0.0282441433519125, + "learning_rate": 1e-06, + "loss": 0.0069, + "step": 1067 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.23132744431495667, + "epoch": 1.7115384615384617, + "grad_norm": 0.160556823015213, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4672.0, + "completions/max_terminated_length": 4672.0, + "completions/mean_length": 1349.27734375, + "completions/mean_terminated_length": 1349.27734375, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.22879090905189514, + "epoch": 1.7131410256410255, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.10462523251771927, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 826677881.0, + "reward": 0.2972085177898407, + "reward_std": 0.11050209403038025, + "rewards/progression_diversity/mean": -0.0008295955485664308, + "rewards/progression_diversity/std": 0.008723512291908264, + "rewards/symbolic_reward_accuracy/mean": 0.158203125, + "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, + "rewards/symbolic_reward_partial_score/mean": 0.67431640625, + "rewards/symbolic_reward_partial_score/std": 0.23070181906223297, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0478163957595825, + "sampling/importance_sampling_ratio/min": 0.0023437354248017073, + "sampling/sampling_logp_difference/max": 6.056009292602539, + "sampling/sampling_logp_difference/mean": 0.09988612681627274, + "step": 1069 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.22955607622861862, + "epoch": 1.7147435897435899, + "grad_norm": 0.022891266271471977, + "learning_rate": 1e-06, + "loss": 0.0089, + "step": 1070 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.23325985670089722, + "epoch": 1.7163461538461537, + "grad_norm": 0.021752603352069855, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 1071 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2312638759613037, + "epoch": 1.717948717948718, + "grad_norm": 0.016995754092931747, + "learning_rate": 1e-06, + "loss": 0.0081, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3701.0, + "completions/mean_length": 1393.966796875, + "completions/mean_terminated_length": 1364.632080078125, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.23279593884944916, + "epoch": 1.719551282051282, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.02495507150888443, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 828169640.0, + "reward": 0.2605937719345093, + "reward_std": 0.053633954375982285, + "rewards/progression_diversity/mean": -0.0006842931034043431, + "rewards/progression_diversity/std": 0.006281269248574972, + "rewards/symbolic_reward_accuracy/mean": 0.111328125, + "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, + "rewards/symbolic_reward_partial_score/mean": 0.6466634273529053, + "rewards/symbolic_reward_partial_score/std": 0.21451237797737122, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0482239723205566, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 149.9818115234375, + "sampling/sampling_logp_difference/mean": 0.10541543364524841, + "step": 1073 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2331898808479309, + "epoch": 1.7211538461538463, + "grad_norm": 0.02485121600329876, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 1074 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2273246869444847, + "epoch": 1.7227564102564101, + "grad_norm": 0.025766994804143906, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1075 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2262198030948639, + "epoch": 1.7243589743589745, + "grad_norm": 70.85974884033203, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12415.0, + "completions/mean_length": 1387.005859375, + "completions/mean_terminated_length": 1298.614990234375, + "completions/min_length": 303.0, + "completions/min_terminated_length": 303.0, + "entropy": 0.22221150994300842, + "epoch": 1.7259615384615383, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.019594384357333183, + "learning_rate": 1e-06, + "loss": 0.0331, + "num_tokens": 829798859.0, + "reward": 0.3287256360054016, + "reward_std": 0.0677950382232666, + "rewards/progression_diversity/mean": -0.0004827457305509597, + "rewards/progression_diversity/std": 0.005075244233012199, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.6797525882720947, + "rewards/symbolic_reward_partial_score/std": 0.23522421717643738, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0458087921142578, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.522271990776062, + "step": 1077 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2273356318473816, + "epoch": 1.7275641025641026, + "grad_norm": 0.018257969990372658, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 1078 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.22496186941862106, + "epoch": 1.7291666666666665, + "grad_norm": 0.01670248992741108, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 1079 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2231815606355667, + "epoch": 1.7307692307692308, + "grad_norm": 0.021122116595506668, + "learning_rate": 1e-06, + "loss": 0.061, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4567.0, + "completions/max_terminated_length": 4567.0, + "completions/mean_length": 1337.560546875, + "completions/mean_terminated_length": 1337.560546875, + "completions/min_length": 488.0, + "completions/min_terminated_length": 488.0, + "entropy": 0.2179974913597107, + "epoch": 1.7323717948717947, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02882332168519497, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 831412682.0, + "reward": 0.25469720363616943, + "reward_std": 0.042546242475509644, + "rewards/progression_diversity/mean": -0.0004942620871588588, + "rewards/progression_diversity/std": 0.004545349627733231, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.6068196892738342, + "rewards/symbolic_reward_partial_score/std": 0.2223115861415863, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0473459959030151, + "sampling/importance_sampling_ratio/min": 2.902779897340224e-06, + "sampling/sampling_logp_difference/max": 12.749841690063477, + "sampling/sampling_logp_difference/mean": 0.09889820963144302, + "step": 1081 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22881120443344116, + "epoch": 1.733974358974359, + "grad_norm": 0.024050477892160416, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 1082 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.22975780814886093, + "epoch": 1.7355769230769231, + "grad_norm": 0.014919934794306755, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 1083 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.22842169553041458, + "epoch": 1.7371794871794872, + "grad_norm": 0.015330510213971138, + "learning_rate": 1e-06, + "loss": -0.0049, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3636.0, + "completions/max_terminated_length": 3636.0, + "completions/mean_length": 1231.79296875, + "completions/mean_terminated_length": 1231.79296875, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.23932421952486038, + "epoch": 1.7387820512820513, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.02318231202661991, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 832881792.0, + "reward": 0.25613856315612793, + "reward_std": 0.03916171193122864, + "rewards/progression_diversity/mean": -0.0004032338911201805, + "rewards/progression_diversity/std": 0.0032340919133275747, + "rewards/symbolic_reward_accuracy/mean": 0.09765625, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.658496081829071, + "rewards/symbolic_reward_partial_score/std": 0.20491082966327667, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.050029993057251, + "sampling/importance_sampling_ratio/min": 0.0007102126837708056, + "sampling/sampling_logp_difference/max": 7.249946117401123, + "sampling/sampling_logp_difference/mean": 0.10211381316184998, + "step": 1085 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.23092254996299744, + "epoch": 1.7403846153846154, + "grad_norm": 0.013799590058624744, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 1086 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23684141784906387, + "epoch": 1.7419871794871795, + "grad_norm": 0.018218791112303734, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 1087 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2406981661915779, + "epoch": 1.7435897435897436, + "grad_norm": 0.012886385433375835, + "learning_rate": 1e-06, + "loss": 0.0037, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4425.0, + "completions/mean_length": 1411.501953125, + "completions/mean_terminated_length": 1233.9625244140625, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.22861982136964798, + "epoch": 1.7451923076923077, + "frac_reward_zero_std": 0.28125, + "grad_norm": 152.24722290039062, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 834562321.0, + "reward": 0.2795882225036621, + "reward_std": 0.03494984656572342, + "rewards/progression_diversity/mean": -0.0011390319559723139, + "rewards/progression_diversity/std": 0.019081035628914833, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.6273112297058105, + "rewards/symbolic_reward_partial_score/std": 0.21833449602127075, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0453113317489624, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 0.9983885884284973, + "step": 1089 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2288360819220543, + "epoch": 1.7467948717948718, + "grad_norm": 0.01483655534684658, + "learning_rate": 1e-06, + "loss": 0.0598, + "step": 1090 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2326875552535057, + "epoch": 1.748397435897436, + "grad_norm": 136.22293090820312, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1091 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.233309805393219, + "epoch": 1.75, + "grad_norm": 0.023925775662064552, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3879.0, + "completions/mean_length": 1282.169921875, + "completions/mean_terminated_length": 1252.616455078125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "entropy": 0.23763851076364517, + "epoch": 1.751602564102564, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.028567634522914886, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 836112856.0, + "reward": 0.2890186309814453, + "reward_std": 0.06066850945353508, + "rewards/progression_diversity/mean": -0.00048197241267189384, + "rewards/progression_diversity/std": 0.0038850673008710146, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.6984374523162842, + "rewards/symbolic_reward_partial_score/std": 0.21642756462097168, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0489928722381592, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.8276915550231934, + "step": 1093 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23289386928081512, + "epoch": 1.7532051282051282, + "grad_norm": 0.015758074820041656, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 1094 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.23799997568130493, + "epoch": 1.7548076923076923, + "grad_norm": 0.026696138083934784, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 1095 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2384975180029869, + "epoch": 1.7564102564102564, + "grad_norm": 0.02093910239636898, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3248.0, + "completions/max_terminated_length": 3248.0, + "completions/mean_length": 1161.291015625, + "completions/mean_terminated_length": 1161.291015625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.23988765478134155, + "epoch": 1.7580128205128205, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.025087056681513786, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 837526381.0, + "reward": 0.32568255066871643, + "reward_std": 0.07381178438663483, + "rewards/progression_diversity/mean": -0.00010529650171520188, + "rewards/progression_diversity/std": 0.0017732558771967888, + "rewards/symbolic_reward_accuracy/mean": 0.21484375, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.6559244990348816, + "rewards/symbolic_reward_partial_score/std": 0.2370387762784958, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.051661729812622, + "sampling/importance_sampling_ratio/min": 8.779976212736074e-08, + "sampling/sampling_logp_difference/max": 16.248207092285156, + "sampling/sampling_logp_difference/mean": 0.10443771630525589, + "step": 1097 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.24236983805894852, + "epoch": 1.7596153846153846, + "grad_norm": 0.01639835350215435, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 1098 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23802829533815384, + "epoch": 1.7612179487179487, + "grad_norm": 0.01632961444556713, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 1099 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.23437372595071793, + "epoch": 1.7628205128205128, + "grad_norm": 0.013542949222028255, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3300.0, + "completions/mean_length": 1334.92578125, + "completions/mean_terminated_length": 1275.909912109375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.22916942834854126, + "epoch": 1.7644230769230769, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.020054111257195473, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 839122695.0, + "reward": 0.2580447793006897, + "reward_std": 0.033093225210905075, + "rewards/progression_diversity/mean": -0.00021209442638792098, + "rewards/progression_diversity/std": 0.0026572090573608875, + "rewards/symbolic_reward_accuracy/mean": 0.107421875, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.6466145515441895, + "rewards/symbolic_reward_partial_score/std": 0.1929846704006195, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0464985370635986, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.6539708971977234, + "step": 1101 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.22496525943279266, + "epoch": 1.766025641025641, + "grad_norm": 698.2822265625, + "learning_rate": 1e-06, + "loss": 0.0585, + "step": 1102 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22553813457489014, + "epoch": 1.7676282051282053, + "grad_norm": 0.027719808742403984, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 1103 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2250175029039383, + "epoch": 1.7692307692307692, + "grad_norm": 0.015375426970422268, + "learning_rate": 1e-06, + "loss": 0.0297, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3192.0, + "completions/mean_length": 1293.3046875, + "completions/mean_terminated_length": 1174.4803466796875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.22259868681430817, + "epoch": 1.7708333333333335, + "frac_reward_zero_std": 0.34375, + "grad_norm": 692.8626098632812, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 840688755.0, + "reward": 0.3711351156234741, + "reward_std": 0.09630028158426285, + "rewards/progression_diversity/mean": -0.001723826164379716, + "rewards/progression_diversity/std": 0.026012076064944267, + "rewards/symbolic_reward_accuracy/mean": 0.271484375, + "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, + "rewards/symbolic_reward_partial_score/mean": 0.6968098878860474, + "rewards/symbolic_reward_partial_score/std": 0.23859074711799622, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0450929403305054, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.8819183111190796, + "step": 1105 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22188594937324524, + "epoch": 1.7724358974358974, + "grad_norm": 0.017745744436979294, + "learning_rate": 1e-06, + "loss": -0.0115, + "step": 1106 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22368201613426208, + "epoch": 1.7740384615384617, + "grad_norm": 0.01989768259227276, + "learning_rate": 1e-06, + "loss": 0.037, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22869247198104858, + "epoch": 1.7756410256410255, + "grad_norm": 0.013347904197871685, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3432.0, + "completions/mean_length": 1230.427734375, + "completions/mean_terminated_length": 1171.0020751953125, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.22080733627080917, + "epoch": 1.7772435897435899, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.01532725803554058, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 842301870.0, + "reward": 0.2819012403488159, + "reward_std": 0.061507582664489746, + "rewards/progression_diversity/mean": -0.0003044310724362731, + "rewards/progression_diversity/std": 0.005076899658888578, + "rewards/symbolic_reward_accuracy/mean": 0.142578125, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.6558268070220947, + "rewards/symbolic_reward_partial_score/std": 0.21851463615894318, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0459144115447998, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.5158374905586243, + "step": 1109 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2195032313466072, + "epoch": 1.7788461538461537, + "grad_norm": 0.027773037552833557, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1110 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2160264179110527, + "epoch": 1.780448717948718, + "grad_norm": 0.016807833686470985, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 1111 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21882987767457962, + "epoch": 1.782051282051282, + "grad_norm": 0.017015958204865456, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3504.0, + "completions/mean_length": 1344.291015625, + "completions/mean_terminated_length": 1285.3118896484375, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "entropy": 0.22031641751527786, + "epoch": 1.7836538461538463, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02029530704021454, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 843814051.0, + "reward": 0.3037402629852295, + "reward_std": 0.06527281552553177, + "rewards/progression_diversity/mean": -1.7595344559140358e-07, + "rewards/progression_diversity/std": 3.981372174166609e-06, + "rewards/symbolic_reward_accuracy/mean": 0.1640625, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.6856445074081421, + "rewards/symbolic_reward_partial_score/std": 0.2108502984046936, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0456528663635254, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.39048463106155396, + "step": 1113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21349480748176575, + "epoch": 1.7852564102564101, + "grad_norm": 0.019900523126125336, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1114 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.21848982572555542, + "epoch": 1.7868589743589745, + "grad_norm": 0.010702534578740597, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 1115 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.22711623460054398, + "epoch": 1.7884615384615383, + "grad_norm": 0.021489204838871956, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3124.0, + "completions/max_terminated_length": 3124.0, + "completions/mean_length": 1099.703125, + "completions/mean_terminated_length": 1099.703125, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.2246062159538269, + "epoch": 1.7900641025641026, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01770607940852642, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 845220171.0, + "reward": 0.3277079463005066, + "reward_std": 0.05349338799715042, + "rewards/progression_diversity/mean": -0.00020373196457512677, + "rewards/progression_diversity/std": 0.002731936750933528, + "rewards/symbolic_reward_accuracy/mean": 0.1953125, + "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, + "rewards/symbolic_reward_partial_score/mean": 0.7017415761947632, + "rewards/symbolic_reward_partial_score/std": 0.196936696767807, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0474233627319336, + "sampling/importance_sampling_ratio/min": 1.3281011845833746e-08, + "sampling/sampling_logp_difference/max": 18.136930465698242, + "sampling/sampling_logp_difference/mean": 0.09797333925962448, + "step": 1117 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21827614307403564, + "epoch": 1.7916666666666665, + "grad_norm": 0.011651732958853245, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1118 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.21932896971702576, + "epoch": 1.7932692307692308, + "grad_norm": 0.010273098014295101, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 1119 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2230270653963089, + "epoch": 1.7948717948717947, + "grad_norm": 0.02401752397418022, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3138.0, + "completions/max_terminated_length": 3138.0, + "completions/mean_length": 1054.958984375, + "completions/mean_terminated_length": 1054.958984375, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.21830987185239792, + "epoch": 1.796474358974359, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.015644196420907974, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 846662438.0, + "reward": 0.3441348671913147, + "reward_std": 0.06147376075387001, + "rewards/progression_diversity/mean": -8.792350126896054e-05, + "rewards/progression_diversity/std": 0.00172401312738657, + "rewards/symbolic_reward_accuracy/mean": 0.216796875, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.7135253548622131, + "rewards/symbolic_reward_partial_score/std": 0.2144842892885208, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0474189519882202, + "sampling/importance_sampling_ratio/min": 4.838908353121951e-05, + "sampling/sampling_logp_difference/max": 9.936236381530762, + "sampling/sampling_logp_difference/mean": 0.0984996110200882, + "step": 1121 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22250553965568542, + "epoch": 1.7980769230769231, + "grad_norm": 0.020056243985891342, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 1122 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22270743548870087, + "epoch": 1.7996794871794872, + "grad_norm": 0.018208522349596024, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 1123 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.22166861593723297, + "epoch": 1.8012820512820513, + "grad_norm": 0.014248647727072239, + "learning_rate": 1e-06, + "loss": -0.0053, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5645.0, + "completions/max_terminated_length": 5645.0, + "completions/mean_length": 1187.763671875, + "completions/mean_terminated_length": 1187.763671875, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.21534562855958939, + "epoch": 1.8028846153846154, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.024967024102807045, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 848091485.0, + "reward": 0.4134520888328552, + "reward_std": 0.04205480217933655, + "rewards/progression_diversity/mean": -0.0004985664272680879, + "rewards/progression_diversity/std": 0.007882904261350632, + "rewards/symbolic_reward_accuracy/mean": 0.30078125, + "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, + "rewards/symbolic_reward_partial_score/mean": 0.7766276001930237, + "rewards/symbolic_reward_partial_score/std": 0.2088257223367691, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0464260578155518, + "sampling/importance_sampling_ratio/min": 1.6780650184955448e-05, + "sampling/sampling_logp_difference/max": 10.995284080505371, + "sampling/sampling_logp_difference/mean": 0.09675593674182892, + "step": 1125 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22125839442014694, + "epoch": 1.8044871794871795, + "grad_norm": 0.012069555930793285, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 1126 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22001603245735168, + "epoch": 1.8060897435897436, + "grad_norm": 0.014061706140637398, + "learning_rate": 1e-06, + "loss": -0.0032, + "step": 1127 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22054609656333923, + "epoch": 1.8076923076923077, + "grad_norm": 0.011219343170523643, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 8772.0, + "completions/max_terminated_length": 8772.0, + "completions/mean_length": 1137.666015625, + "completions/mean_terminated_length": 1137.666015625, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.21648870408535004, + "epoch": 1.8092948717948718, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.022340567782521248, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 849531970.0, + "reward": 0.3116188943386078, + "reward_std": 0.024817654862999916, + "rewards/progression_diversity/mean": -0.00022150327276904136, + "rewards/progression_diversity/std": 0.004627527203410864, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.6793619394302368, + "rewards/symbolic_reward_partial_score/std": 0.1976500004529953, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0465974807739258, + "sampling/importance_sampling_ratio/min": 5.26275156570658e-19, + "sampling/sampling_logp_difference/max": 42.088462829589844, + "sampling/sampling_logp_difference/mean": 0.09620961546897888, + "step": 1129 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.21649357676506042, + "epoch": 1.810897435897436, + "grad_norm": 0.01756151393055916, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 1130 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21943299472332, + "epoch": 1.8125, + "grad_norm": 0.015073378570377827, + "learning_rate": 1e-06, + "loss": -0.007, + "step": 1131 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2232174053788185, + "epoch": 1.814102564102564, + "grad_norm": 0.01907176896929741, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3176.0, + "completions/max_terminated_length": 3176.0, + "completions/mean_length": 1107.029296875, + "completions/mean_terminated_length": 1107.029296875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.2129407450556755, + "epoch": 1.8157051282051282, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.025003377348184586, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 850939665.0, + "reward": 0.28454291820526123, + "reward_std": 0.03294537216424942, + "rewards/progression_diversity/mean": -0.00029976246878504753, + "rewards/progression_diversity/std": 0.002749260514974594, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.682861328125, + "rewards/symbolic_reward_partial_score/std": 0.1956394761800766, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0451654195785522, + "sampling/importance_sampling_ratio/min": 8.991118960466338e-08, + "sampling/sampling_logp_difference/max": 16.224443435668945, + "sampling/sampling_logp_difference/mean": 0.09530235081911087, + "step": 1133 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.21520614624023438, + "epoch": 1.8173076923076923, + "grad_norm": 0.008025935851037502, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 1134 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.21551941335201263, + "epoch": 1.8189102564102564, + "grad_norm": 0.021345248445868492, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1135 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.21580560505390167, + "epoch": 1.8205128205128205, + "grad_norm": 0.0155746815726161, + "learning_rate": 1e-06, + "loss": 0.0006, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3362.0, + "completions/max_terminated_length": 3362.0, + "completions/mean_length": 1122.947265625, + "completions/mean_terminated_length": 1122.947265625, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 0.21716423332691193, + "epoch": 1.8221153846153846, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02500241994857788, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 852408278.0, + "reward": 0.28805452585220337, + "reward_std": 0.041980091482400894, + "rewards/progression_diversity/mean": -0.00020892325846944004, + "rewards/progression_diversity/std": 0.001957833534106612, + "rewards/symbolic_reward_accuracy/mean": 0.13671875, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.6867513656616211, + "rewards/symbolic_reward_partial_score/std": 0.19679629802703857, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045684576034546, + "sampling/importance_sampling_ratio/min": 4.51097449324922e-12, + "sampling/sampling_logp_difference/max": 26.124507904052734, + "sampling/sampling_logp_difference/mean": 0.09778503328561783, + "step": 1137 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.21945731341838837, + "epoch": 1.8237179487179487, + "grad_norm": 0.013288813643157482, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 1138 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.22035419940948486, + "epoch": 1.8253205128205128, + "grad_norm": 0.025754936039447784, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 1139 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22086691856384277, + "epoch": 1.8269230769230769, + "grad_norm": 0.018110549077391624, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3428.0, + "completions/mean_length": 1185.529296875, + "completions/mean_terminated_length": 1155.7867431640625, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "entropy": 0.22858010977506638, + "epoch": 1.828525641025641, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.029191119596362114, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 853835237.0, + "reward": 0.38153761625289917, + "reward_std": 0.07013949751853943, + "rewards/progression_diversity/mean": -4.912294389214367e-05, + "rewards/progression_diversity/std": 0.0007894500158727169, + "rewards/symbolic_reward_accuracy/mean": 0.275390625, + "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, + "rewards/symbolic_reward_partial_score/mean": 0.7216634154319763, + "rewards/symbolic_reward_partial_score/std": 0.22831861674785614, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.046138048171997, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 472.0, + "sampling/sampling_logp_difference/mean": 0.10319283604621887, + "step": 1141 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2211167812347412, + "epoch": 1.8301282051282053, + "grad_norm": 0.02721370942890644, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2238641008734703, + "epoch": 1.8317307692307692, + "grad_norm": 0.02170705609023571, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 1143 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.23243413865566254, + "epoch": 1.8333333333333335, + "grad_norm": 0.014753867872059345, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3178.0, + "completions/mean_length": 1113.748046875, + "completions/mean_terminated_length": 1083.864990234375, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "entropy": 0.22820258885622025, + "epoch": 1.8349358974358974, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.017704647034406662, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 855269764.0, + "reward": 0.2694726586341858, + "reward_std": 0.026778917759656906, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.126953125, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.6449869871139526, + "rewards/symbolic_reward_partial_score/std": 0.22033336758613586, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0469450950622559, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 720.0, + "sampling/sampling_logp_difference/mean": 0.3227856755256653, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.22650671005249023, + "epoch": 1.8365384615384617, + "grad_norm": 199.25045776367188, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 1146 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22604414075613022, + "epoch": 1.8381410256410255, + "grad_norm": 0.009444105438888073, + "learning_rate": 1e-06, + "loss": -0.0039, + "step": 1147 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.22440266609191895, + "epoch": 1.8397435897435899, + "grad_norm": 0.01778390072286129, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3424.0, + "completions/mean_length": 1209.474609375, + "completions/mean_terminated_length": 1120.037353515625, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.22518840432167053, + "epoch": 1.8413461538461537, + "frac_reward_zero_std": 0.4375, + "grad_norm": 323.1822814941406, + "learning_rate": 1e-06, + "loss": 0.0188, + "num_tokens": 856870839.0, + "reward": 0.28326651453971863, + "reward_std": 0.05484289303421974, + "rewards/progression_diversity/mean": -0.0004978624056093395, + "rewards/progression_diversity/std": 0.009513349272310734, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.6402018070220947, + "rewards/symbolic_reward_partial_score/std": 0.2216087281703949, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0432510375976562, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.484938621520996, + "step": 1149 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.21288639307022095, + "epoch": 1.842948717948718, + "grad_norm": 1597.748046875, + "learning_rate": 1e-06, + "loss": 0.155, + "step": 1150 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22490354627370834, + "epoch": 1.844551282051282, + "grad_norm": 0.015957126393914223, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 1151 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22101520746946335, + "epoch": 1.8461538461538463, + "grad_norm": 0.017633656039834023, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3059.0, + "completions/max_terminated_length": 3059.0, + "completions/mean_length": 1138.349609375, + "completions/mean_terminated_length": 1138.349609375, + "completions/min_length": 356.0, + "completions/min_terminated_length": 356.0, + "entropy": 0.23107396066188812, + "epoch": 1.8477564102564101, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.022070132195949554, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 858210874.0, + "reward": 0.38855424523353577, + "reward_std": 0.0762196034193039, + "rewards/progression_diversity/mean": -4.658957914216444e-05, + "rewards/progression_diversity/std": 0.0010542018571868539, + "rewards/symbolic_reward_accuracy/mean": 0.267578125, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.7600260972976685, + "rewards/symbolic_reward_partial_score/std": 0.22460278868675232, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0477694272994995, + "sampling/importance_sampling_ratio/min": 7.687971810810268e-05, + "sampling/sampling_logp_difference/max": 9.473268508911133, + "sampling/sampling_logp_difference/mean": 0.10008453577756882, + "step": 1153 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22674327343702316, + "epoch": 1.8493589743589745, + "grad_norm": 0.015364538878202438, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 1154 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.22742107510566711, + "epoch": 1.8509615384615383, + "grad_norm": 0.012995360419154167, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 1155 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.22633031755685806, + "epoch": 1.8525641025641026, + "grad_norm": 0.015840977430343628, + "learning_rate": 1e-06, + "loss": -0.0093, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3445.0, + "completions/mean_length": 1179.333984375, + "completions/mean_terminated_length": 1119.7078857421875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.22518686205148697, + "epoch": 1.8541666666666665, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.025853393599390984, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 859689013.0, + "reward": 0.35142046213150024, + "reward_std": 0.06578496098518372, + "rewards/progression_diversity/mean": -0.0005352114676497877, + "rewards/progression_diversity/std": 0.010669847950339317, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.7508463263511658, + "rewards/symbolic_reward_partial_score/std": 0.1848256140947342, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0448198318481445, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.6326540112495422, + "step": 1157 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.21813210099935532, + "epoch": 1.8557692307692308, + "grad_norm": 0.018281714990735054, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 1158 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.22226067632436752, + "epoch": 1.8573717948717947, + "grad_norm": 0.018198495730757713, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 1159 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.22661340981721878, + "epoch": 1.858974358974359, + "grad_norm": 0.010464332066476345, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3748.0, + "completions/mean_length": 1216.416015625, + "completions/mean_terminated_length": 1186.73388671875, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 0.22897808998823166, + "epoch": 1.8605769230769231, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.018317364156246185, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 861117626.0, + "reward": 0.27255719900131226, + "reward_std": 0.025438256561756134, + "rewards/progression_diversity/mean": -0.00014033068146090955, + "rewards/progression_diversity/std": 0.002014033030718565, + "rewards/symbolic_reward_accuracy/mean": 0.12890625, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.6513671875, + "rewards/symbolic_reward_partial_score/std": 0.2093709260225296, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0455472469329834, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 0.44371387362480164, + "step": 1161 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.22330108284950256, + "epoch": 1.8621794871794872, + "grad_norm": 0.010753365233540535, + "learning_rate": 1e-06, + "loss": 0.0224, + "step": 1162 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.22619932144880295, + "epoch": 1.8637820512820513, + "grad_norm": 0.012639653868973255, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1163 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2241613268852234, + "epoch": 1.8653846153846154, + "grad_norm": 0.02455468475818634, + "learning_rate": 1e-06, + "loss": 0.0086, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3400.0, + "completions/max_terminated_length": 3400.0, + "completions/mean_length": 1240.51171875, + "completions/mean_terminated_length": 1240.51171875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "entropy": 0.23062631487846375, + "epoch": 1.8669871794871795, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.010606328025460243, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 862543008.0, + "reward": 0.3666699528694153, + "reward_std": 0.06977473199367523, + "rewards/progression_diversity/mean": -1.885928213596344e-07, + "rewards/progression_diversity/std": 4.2673682401073165e-06, + "rewards/symbolic_reward_accuracy/mean": 0.251953125, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.7183268070220947, + "rewards/symbolic_reward_partial_score/std": 0.23471878468990326, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.047785997390747, + "sampling/importance_sampling_ratio/min": 6.825915477293165e-08, + "sampling/sampling_logp_difference/max": 16.499954223632812, + "sampling/sampling_logp_difference/mean": 0.09932725131511688, + "step": 1165 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.224430114030838, + "epoch": 1.8685897435897436, + "grad_norm": 0.021581292152404785, + "learning_rate": 1e-06, + "loss": -0.0059, + "step": 1166 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22753822803497314, + "epoch": 1.8701923076923077, + "grad_norm": 0.014603802002966404, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.22805190086364746, + "epoch": 1.8717948717948718, + "grad_norm": 0.010613877326250076, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3325.0, + "completions/mean_length": 1174.26953125, + "completions/mean_terminated_length": 1144.5048828125, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.2323755919933319, + "epoch": 1.873397435897436, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.02169196680188179, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 863941530.0, + "reward": 0.3810667097568512, + "reward_std": 0.07798746973276138, + "rewards/progression_diversity/mean": -0.0002637706929817796, + "rewards/progression_diversity/std": 0.003273366717621684, + "rewards/symbolic_reward_accuracy/mean": 0.279296875, + "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, + "rewards/symbolic_reward_partial_score/mean": 0.7122883796691895, + "rewards/symbolic_reward_partial_score/std": 0.2458869218826294, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0482022762298584, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 712.0, + "sampling/sampling_logp_difference/mean": 0.4974533021450043, + "step": 1169 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23275116831064224, + "epoch": 1.875, + "grad_norm": 0.011001808568835258, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 1170 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.23044289648532867, + "epoch": 1.876602564102564, + "grad_norm": 0.016635850071907043, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1171 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.23118175566196442, + "epoch": 1.8782051282051282, + "grad_norm": 96598.3203125, + "learning_rate": 1e-06, + "loss": 4.3159, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4005.0, + "completions/mean_length": 1170.333984375, + "completions/mean_terminated_length": 1140.5616455078125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.22949066758155823, + "epoch": 1.8798076923076923, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.020497862249612808, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 865447381.0, + "reward": 0.44851863384246826, + "reward_std": 0.04487219825387001, + "rewards/progression_diversity/mean": -0.00019038662139791995, + "rewards/progression_diversity/std": 0.0023014748003333807, + "rewards/symbolic_reward_accuracy/mean": 0.357421875, + "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, + "rewards/symbolic_reward_partial_score/mean": 0.7808756828308105, + "rewards/symbolic_reward_partial_score/std": 0.20934894680976868, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0473530292510986, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.7223914265632629, + "step": 1173 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.23133239150047302, + "epoch": 1.8814102564102564, + "grad_norm": 0.010313909500837326, + "learning_rate": 1e-06, + "loss": -0.0036, + "step": 1174 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.23350611329078674, + "epoch": 1.8830128205128205, + "grad_norm": 0.019430264830589294, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 1175 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2238868772983551, + "epoch": 1.8846153846153846, + "grad_norm": 0.013273504562675953, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3514.0, + "completions/mean_length": 1243.806640625, + "completions/mean_terminated_length": 1154.57177734375, + "completions/min_length": 389.0, + "completions/min_terminated_length": 389.0, + "entropy": 0.2321956604719162, + "epoch": 1.8862179487179487, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.014576968736946583, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 866961778.0, + "reward": 0.20335838198661804, + "reward_std": 0.03265673667192459, + "rewards/progression_diversity/mean": -0.0005896041984669864, + "rewards/progression_diversity/std": 0.008867304772138596, + "rewards/symbolic_reward_accuracy/mean": 0.03125, + "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, + "rewards/symbolic_reward_partial_score/mean": 0.6173340082168579, + "rewards/symbolic_reward_partial_score/std": 0.1687249392271042, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0458412170410156, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.8149086236953735, + "step": 1177 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22551760077476501, + "epoch": 1.8878205128205128, + "grad_norm": 0.017989974468946457, + "learning_rate": 1e-06, + "loss": 0.0529, + "step": 1178 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2227119281888008, + "epoch": 1.8894230769230769, + "grad_norm": 0.02180834859609604, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 1179 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22573918104171753, + "epoch": 1.891025641025641, + "grad_norm": 126.00947570800781, + "learning_rate": 1e-06, + "loss": 0.0381, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3086.0, + "completions/mean_length": 1103.10546875, + "completions/mean_terminated_length": 1073.2015380859375, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.22882883995771408, + "epoch": 1.8926282051282053, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.019178207963705063, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 868483272.0, + "reward": 0.3014499545097351, + "reward_std": 0.037228312343358994, + "rewards/progression_diversity/mean": -0.001001341617666185, + "rewards/progression_diversity/std": 0.01981303095817566, + "rewards/symbolic_reward_accuracy/mean": 0.171875, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.6617676019668579, + "rewards/symbolic_reward_partial_score/std": 0.21480776369571686, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0480608940124512, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.56421959400177, + "step": 1181 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.22795479744672775, + "epoch": 1.8942307692307692, + "grad_norm": 0.011316453106701374, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22722984105348587, + "epoch": 1.8958333333333335, + "grad_norm": 0.016038935631513596, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1183 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.22345466166734695, + "epoch": 1.8974358974358974, + "grad_norm": 0.0241796113550663, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3469.0, + "completions/max_terminated_length": 3469.0, + "completions/mean_length": 1094.568359375, + "completions/mean_terminated_length": 1094.568359375, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.23068898171186447, + "epoch": 1.8990384615384617, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.025562407448887825, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 869886539.0, + "reward": 0.2939690053462982, + "reward_std": 0.05816289782524109, + "rewards/progression_diversity/mean": -7.342880417127162e-05, + "rewards/progression_diversity/std": 0.0009783387649804354, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.7142741084098816, + "rewards/symbolic_reward_partial_score/std": 0.16729366779327393, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0498249530792236, + "sampling/importance_sampling_ratio/min": 0.002071201568469405, + "sampling/sampling_logp_difference/max": 6.17962646484375, + "sampling/sampling_logp_difference/mean": 0.10211198776960373, + "step": 1185 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2281019389629364, + "epoch": 1.9006410256410255, + "grad_norm": 0.014062655158340931, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 1186 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.22974391281604767, + "epoch": 1.9022435897435899, + "grad_norm": 0.014239110052585602, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 1187 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.22985130548477173, + "epoch": 1.9038461538461537, + "grad_norm": 0.015234572812914848, + "learning_rate": 1e-06, + "loss": -0.0055, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3333.0, + "completions/max_terminated_length": 3333.0, + "completions/mean_length": 1139.83984375, + "completions/mean_terminated_length": 1139.83984375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.22518738359212875, + "epoch": 1.905448717948718, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.030008932575583458, + "learning_rate": 1e-06, + "loss": 0.0136, + "num_tokens": 871431865.0, + "reward": 0.3191036581993103, + "reward_std": 0.04677165299654007, + "rewards/progression_diversity/mean": -0.0002812910242937505, + "rewards/progression_diversity/std": 0.0030721058137714863, + "rewards/symbolic_reward_accuracy/mean": 0.193359375, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.6769694089889526, + "rewards/symbolic_reward_partial_score/std": 0.21019434928894043, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.048149585723877, + "sampling/importance_sampling_ratio/min": 7.317296608670226e-17, + "sampling/sampling_logp_difference/max": 37.15370559692383, + "sampling/sampling_logp_difference/mean": 0.09954197704792023, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.22382111847400665, + "epoch": 1.907051282051282, + "grad_norm": 0.015485767275094986, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 1190 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22165128588676453, + "epoch": 1.9086538461538463, + "grad_norm": 0.019756661728024483, + "learning_rate": 1e-06, + "loss": -0.0091, + "step": 1191 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22612737119197845, + "epoch": 1.9102564102564101, + "grad_norm": 0.020691078156232834, + "learning_rate": 1e-06, + "loss": -0.0043, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3072.0, + "completions/mean_length": 1140.0625, + "completions/mean_terminated_length": 1140.0625, + "completions/min_length": 410.0, + "completions/min_terminated_length": 410.0, + "entropy": 0.22674325108528137, + "epoch": 1.9118589743589745, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.016689619049429893, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 872902937.0, + "reward": 0.26588261127471924, + "reward_std": 0.033439576625823975, + "rewards/progression_diversity/mean": -0.0001180602703243494, + "rewards/progression_diversity/std": 0.0016232561320066452, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.644091784954071, + "rewards/symbolic_reward_partial_score/std": 0.22011272609233856, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.048423171043396, + "sampling/importance_sampling_ratio/min": 0.0005005777347832918, + "sampling/sampling_logp_difference/max": 7.599747657775879, + "sampling/sampling_logp_difference/mean": 0.10049919784069061, + "step": 1193 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22232681512832642, + "epoch": 1.9134615384615383, + "grad_norm": 0.019354144111275673, + "learning_rate": 1e-06, + "loss": -0.0067, + "step": 1194 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22689828276634216, + "epoch": 1.9150641025641026, + "grad_norm": 0.015032010152935982, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 1195 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2218078002333641, + "epoch": 1.9166666666666665, + "grad_norm": 0.015426430851221085, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3017.0, + "completions/mean_length": 1175.25, + "completions/mean_terminated_length": 1115.60791015625, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.22400058060884476, + "epoch": 1.9182692307692308, + "frac_reward_zero_std": 0.53125, + "grad_norm": 994.896484375, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 874303465.0, + "reward": 0.30390554666519165, + "reward_std": 0.03565283119678497, + "rewards/progression_diversity/mean": -7.151110912673175e-05, + "rewards/progression_diversity/std": 0.0012204337399452925, + "rewards/symbolic_reward_accuracy/mean": 0.169921875, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.6744791269302368, + "rewards/symbolic_reward_partial_score/std": 0.20566026866436005, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0463134050369263, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 724.0, + "sampling/sampling_logp_difference/mean": 0.4934929311275482, + "step": 1197 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2278030663728714, + "epoch": 1.9198717948717947, + "grad_norm": 0.0072776079177856445, + "learning_rate": 1e-06, + "loss": -0.0058, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.22660139203071594, + "epoch": 1.921474358974359, + "grad_norm": 0.011884551495313644, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1199 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.22588203847408295, + "epoch": 1.9230769230769231, + "grad_norm": 0.008959163911640644, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3548.0, + "completions/max_terminated_length": 3548.0, + "completions/mean_length": 1056.767578125, + "completions/mean_terminated_length": 1056.767578125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.22032135725021362, + "epoch": 1.9246794871794872, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.01947578601539135, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 875797554.0, + "reward": 0.24959278106689453, + "reward_std": 0.034925058484077454, + "rewards/progression_diversity/mean": -0.00019414318376220763, + "rewards/progression_diversity/std": 0.0018961295718327165, + "rewards/symbolic_reward_accuracy/mean": 0.09765625, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.6366698741912842, + "rewards/symbolic_reward_partial_score/std": 0.21680741012096405, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0476453304290771, + "sampling/importance_sampling_ratio/min": 1.6844656158809812e-07, + "sampling/sampling_logp_difference/max": 15.596647262573242, + "sampling/sampling_logp_difference/mean": 0.0985308587551117, + "step": 1201 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.21852584183216095, + "epoch": 1.9262820512820513, + "grad_norm": 0.02164105512201786, + "learning_rate": 1e-06, + "loss": -0.0057, + "step": 1202 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.21969875693321228, + "epoch": 1.9278846153846154, + "grad_norm": 0.013584469445049763, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 1203 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2164856493473053, + "epoch": 1.9294871794871795, + "grad_norm": 0.01743365451693535, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3204.0, + "completions/max_terminated_length": 3204.0, + "completions/mean_length": 1141.201171875, + "completions/mean_terminated_length": 1141.201171875, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.2209923416376114, + "epoch": 1.9310897435897436, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.022249111905694008, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 877274857.0, + "reward": 0.3205749988555908, + "reward_std": 0.028882907703518867, + "rewards/progression_diversity/mean": -0.00011996681860182434, + "rewards/progression_diversity/std": 0.0014987426111474633, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.7170247435569763, + "rewards/symbolic_reward_partial_score/std": 0.19960245490074158, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0480538606643677, + "sampling/importance_sampling_ratio/min": 6.5602007333354084e-12, + "sampling/sampling_logp_difference/max": 25.75, + "sampling/sampling_logp_difference/mean": 0.09933799505233765, + "step": 1205 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.21989618986845016, + "epoch": 1.9326923076923077, + "grad_norm": 0.008175384253263474, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1206 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.22836829721927643, + "epoch": 1.9342948717948718, + "grad_norm": 0.016739025712013245, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1207 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2236020788550377, + "epoch": 1.935897435897436, + "grad_norm": 0.013731605373322964, + "learning_rate": 1e-06, + "loss": 0.0002, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3453.0, + "completions/mean_length": 1140.861328125, + "completions/mean_terminated_length": 1111.03125, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.2177734076976776, + "epoch": 1.9375, + "frac_reward_zero_std": 0.40625, + "grad_norm": 574.372314453125, + "learning_rate": 1e-06, + "loss": 0.0217, + "num_tokens": 878788738.0, + "reward": 0.20117032527923584, + "reward_std": 0.02102627232670784, + "rewards/progression_diversity/mean": -0.00015734444605186582, + "rewards/progression_diversity/std": 0.002003313507884741, + "rewards/symbolic_reward_accuracy/mean": 0.0234375, + "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, + "rewards/symbolic_reward_partial_score/mean": 0.6243489980697632, + "rewards/symbolic_reward_partial_score/std": 0.15533775091171265, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0462570190429688, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.6281495690345764, + "step": 1209 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.22087281942367554, + "epoch": 1.939102564102564, + "grad_norm": 0.0154855502769351, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 1210 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22198112308979034, + "epoch": 1.9407051282051282, + "grad_norm": 0.018509745597839355, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1211 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22084571421146393, + "epoch": 1.9423076923076923, + "grad_norm": 0.021920692175626755, + "learning_rate": 1e-06, + "loss": -0.0027, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3133.0, + "completions/mean_length": 1260.185546875, + "completions/mean_terminated_length": 1200.8765869140625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.22076184302568436, + "epoch": 1.9439102564102564, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.016443662345409393, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 880305873.0, + "reward": 0.2951447665691376, + "reward_std": 0.037818461656570435, + "rewards/progression_diversity/mean": -0.00017372408183291554, + "rewards/progression_diversity/std": 0.002410082146525383, + "rewards/symbolic_reward_accuracy/mean": 0.1484375, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.6882486939430237, + "rewards/symbolic_reward_partial_score/std": 0.20156751573085785, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0463939905166626, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.4395545721054077, + "step": 1213 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22432439774274826, + "epoch": 1.9455128205128205, + "grad_norm": 752.1104125976562, + "learning_rate": 1e-06, + "loss": 0.0734, + "step": 1214 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.22369728982448578, + "epoch": 1.9471153846153846, + "grad_norm": 0.005198925733566284, + "learning_rate": 1e-06, + "loss": 0.0301, + "step": 1215 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22393931448459625, + "epoch": 1.9487179487179487, + "grad_norm": 0.01498700212687254, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2988.0, + "completions/max_terminated_length": 2988.0, + "completions/mean_length": 1089.60546875, + "completions/mean_terminated_length": 1089.60546875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.2190534770488739, + "epoch": 1.9503205128205128, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02252928353846073, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 881729991.0, + "reward": 0.28866058588027954, + "reward_std": 0.05070298910140991, + "rewards/progression_diversity/mean": -0.00015254000027198344, + "rewards/progression_diversity/std": 0.0022601871751248837, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.7278319597244263, + "rewards/symbolic_reward_partial_score/std": 0.17303742468357086, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0459058284759521, + "sampling/importance_sampling_ratio/min": 2.218274985921198e-08, + "sampling/sampling_logp_difference/max": 17.623950958251953, + "sampling/sampling_logp_difference/mean": 0.09728400409221649, + "step": 1217 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21607843786478043, + "epoch": 1.9519230769230769, + "grad_norm": 0.030551020056009293, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1218 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2140772044658661, + "epoch": 1.953525641025641, + "grad_norm": 0.009000630117952824, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 1219 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.21863539516925812, + "epoch": 1.9551282051282053, + "grad_norm": 0.01365700364112854, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2953.0, + "completions/max_terminated_length": 2953.0, + "completions/mean_length": 1145.857421875, + "completions/mean_terminated_length": 1145.857421875, + "completions/min_length": 401.0, + "completions/min_terminated_length": 401.0, + "entropy": 0.2172141596674919, + "epoch": 1.9567307692307692, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.024405311793088913, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 883216462.0, + "reward": 0.3235936760902405, + "reward_std": 0.029494676738977432, + "rewards/progression_diversity/mean": -8.07223841547966e-06, + "rewards/progression_diversity/std": 0.00018265389371663332, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.7192708253860474, + "rewards/symbolic_reward_partial_score/std": 0.19232387840747833, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.046884298324585, + "sampling/importance_sampling_ratio/min": 1.05600235755735e-20, + "sampling/sampling_logp_difference/max": 45.99721145629883, + "sampling/sampling_logp_difference/mean": 0.09837117791175842, + "step": 1221 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.22011933475732803, + "epoch": 1.9583333333333335, + "grad_norm": 0.00895681418478489, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 1222 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.21496566385030746, + "epoch": 1.9599358974358974, + "grad_norm": 0.014537299983203411, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1223 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.21989809721708298, + "epoch": 1.9615384615384617, + "grad_norm": 0.008813065476715565, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3093.0, + "completions/max_terminated_length": 3093.0, + "completions/mean_length": 1287.013671875, + "completions/mean_terminated_length": 1287.013671875, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "entropy": 0.21527937054634094, + "epoch": 1.9631410256410255, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.017107542604207993, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 884739141.0, + "reward": 0.28098633885383606, + "reward_std": 0.0465182401239872, + "rewards/progression_diversity/mean": -1.3702083379030228e-06, + "rewards/progression_diversity/std": 3.1004274205770344e-05, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.6866210699081421, + "rewards/symbolic_reward_partial_score/std": 0.21217504143714905, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0459423065185547, + "sampling/importance_sampling_ratio/min": 9.484894411184965e-19, + "sampling/sampling_logp_difference/max": 41.49941635131836, + "sampling/sampling_logp_difference/mean": 0.0953700914978981, + "step": 1225 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21589966118335724, + "epoch": 1.9647435897435899, + "grad_norm": 0.01544854324311018, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 1226 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.21224521100521088, + "epoch": 1.9663461538461537, + "grad_norm": 0.016370629891753197, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1227 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.21077106148004532, + "epoch": 1.967948717948718, + "grad_norm": 0.009457322768867016, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3146.0, + "completions/max_terminated_length": 3146.0, + "completions/mean_length": 1206.787109375, + "completions/mean_terminated_length": 1206.787109375, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.21696603298187256, + "epoch": 1.969551282051282, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.01737239398062229, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 886241192.0, + "reward": 0.41241925954818726, + "reward_std": 0.028087418526411057, + "rewards/progression_diversity/mean": -0.00026385916862636805, + "rewards/progression_diversity/std": 0.002647166606038809, + "rewards/symbolic_reward_accuracy/mean": 0.322265625, + "rewards/symbolic_reward_accuracy/std": 0.46780112385749817, + "rewards/symbolic_reward_partial_score/mean": 0.7302083969116211, + "rewards/symbolic_reward_partial_score/std": 0.22925032675266266, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0460162162780762, + "sampling/importance_sampling_ratio/min": 0.00035160055267624557, + "sampling/sampling_logp_difference/max": 7.953014850616455, + "sampling/sampling_logp_difference/mean": 0.09595166146755219, + "step": 1229 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.21299895644187927, + "epoch": 1.9711538461538463, + "grad_norm": 0.013916675001382828, + "learning_rate": 1e-06, + "loss": 0.0016, + "step": 1230 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2122596800327301, + "epoch": 1.9727564102564101, + "grad_norm": 0.013204848393797874, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2141348496079445, + "epoch": 1.9743589743589745, + "grad_norm": 0.015932990238070488, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3633.0, + "completions/mean_length": 1235.24609375, + "completions/mean_terminated_length": 1205.600830078125, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "entropy": 0.21588806062936783, + "epoch": 1.9759615384615383, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.016500849276781082, + "learning_rate": 1e-06, + "loss": 0.0061, + "num_tokens": 887740710.0, + "reward": 0.3222983777523041, + "reward_std": 0.06374066323041916, + "rewards/progression_diversity/mean": -0.00014570857456419617, + "rewards/progression_diversity/std": 0.0015578385209664702, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.6921712160110474, + "rewards/symbolic_reward_partial_score/std": 0.2026739865541458, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045090913772583, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.3644917607307434, + "step": 1233 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.21097960323095322, + "epoch": 1.9775641025641026, + "grad_norm": 0.0278862826526165, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 1234 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2159006968140602, + "epoch": 1.9791666666666665, + "grad_norm": 0.014327644370496273, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 1235 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2141767367720604, + "epoch": 1.9807692307692308, + "grad_norm": 0.017790069803595543, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3288.0, + "completions/max_terminated_length": 3288.0, + "completions/mean_length": 1284.09375, + "completions/mean_terminated_length": 1284.09375, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.210280142724514, + "epoch": 1.9823717948717947, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.021001746878027916, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 889325798.0, + "reward": 0.2985340356826782, + "reward_std": 0.05770270898938179, + "rewards/progression_diversity/mean": -0.00011500486289151013, + "rewards/progression_diversity/std": 0.0018274638568982482, + "rewards/symbolic_reward_accuracy/mean": 0.142578125, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.7099609375, + "rewards/symbolic_reward_partial_score/std": 0.19229154288768768, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0444259643554688, + "sampling/importance_sampling_ratio/min": 6.935766052899004e-22, + "sampling/sampling_logp_difference/max": 48.72018051147461, + "sampling/sampling_logp_difference/mean": 0.09380759298801422, + "step": 1237 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.20352954417467117, + "epoch": 1.983974358974359, + "grad_norm": 0.02166086994111538, + "learning_rate": 1e-06, + "loss": -0.0037, + "step": 1238 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.20602308958768845, + "epoch": 1.9855769230769231, + "grad_norm": 0.017689600586891174, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 1239 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.20504777133464813, + "epoch": 1.9871794871794872, + "grad_norm": 0.01678495481610298, + "learning_rate": 1e-06, + "loss": -0.0135, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3242.0, + "completions/max_terminated_length": 3242.0, + "completions/mean_length": 1257.923828125, + "completions/mean_terminated_length": 1257.923828125, + "completions/min_length": 450.0, + "completions/min_terminated_length": 450.0, + "entropy": 0.20558489859104156, + "epoch": 1.9887820512820513, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.024906298145651817, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 890897599.0, + "reward": 0.18425101041793823, + "reward_std": 0.02554313838481903, + "rewards/progression_diversity/mean": -0.00019268083269707859, + "rewards/progression_diversity/std": 0.0028432749677449465, + "rewards/symbolic_reward_accuracy/mean": 0.005859375, + "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, + "rewards/symbolic_reward_partial_score/mean": 0.6024577021598816, + "rewards/symbolic_reward_partial_score/std": 0.18746401369571686, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0438592433929443, + "sampling/importance_sampling_ratio/min": 0.0005599469877779484, + "sampling/sampling_logp_difference/max": 7.487668514251709, + "sampling/sampling_logp_difference/mean": 0.09330444037914276, + "step": 1241 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.20575018227100372, + "epoch": 1.9903846153846154, + "grad_norm": 0.01447804644703865, + "learning_rate": 1e-06, + "loss": 0.0037, + "step": 1242 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.208658367395401, + "epoch": 1.9919871794871795, + "grad_norm": 0.026018286123871803, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1243 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.20609137415885925, + "epoch": 1.9935897435897436, + "grad_norm": 0.015114527195692062, + "learning_rate": 1e-06, + "loss": -0.0032, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3097.0, + "completions/mean_length": 1268.546875, + "completions/mean_terminated_length": 1238.9666748046875, + "completions/min_length": 388.0, + "completions/min_terminated_length": 388.0, + "entropy": 0.19605816900730133, + "epoch": 1.9951923076923077, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.026206906884908676, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 892442183.0, + "reward": 0.3488447368144989, + "reward_std": 0.04794444516301155, + "rewards/progression_diversity/mean": -0.000291308737359941, + "rewards/progression_diversity/std": 0.0030992666725069284, + "rewards/symbolic_reward_accuracy/mean": 0.2265625, + "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, + "rewards/symbolic_reward_partial_score/mean": 0.7097005248069763, + "rewards/symbolic_reward_partial_score/std": 0.21007077395915985, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0414423942565918, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.47119155526161194, + "step": 1245 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.19719059765338898, + "epoch": 1.9967948717948718, + "grad_norm": 0.014178547076880932, + "learning_rate": 1e-06, + "loss": -0.0034, + "step": 1246 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.19025534391403198, + "epoch": 1.998397435897436, + "grad_norm": 0.00976389180868864, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 1247 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.19911998510360718, + "epoch": 2.0, + "grad_norm": 0.021017316728830338, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 1248 + }, + { + "epoch": 2.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.00244140625, + "eval_completions/max_length": 6023.21875, + "eval_completions/max_terminated_length": 3145.0, + "eval_completions/mean_length": 1369.302734375, + "eval_completions/mean_terminated_length": 1332.704662322998, + "eval_completions/min_length": 460.25, + "eval_completions/min_terminated_length": 460.25, + "eval_entropy": 0.19390225457027555, + "eval_frac_reward_zero_std": 0.28125, + "eval_loss": 0.00268191983923316, + "eval_num_tokens": 892442183.0, + "eval_reward": 0.23870036005973816, + "eval_reward_std": 0.03971031281980686, + "eval_rewards/progression_diversity/mean": -0.0003259587915636075, + "eval_rewards/progression_diversity/std": 0.002640488281031139, + "eval_rewards/symbolic_reward_accuracy/mean": 0.079345703125, + "eval_rewards/symbolic_reward_accuracy/std": 0.19223300064913929, + "eval_rewards/symbolic_reward_partial_score/mean": 0.6377197187393904, + "eval_rewards/symbolic_reward_partial_score/std": 0.1931128588039428, + "eval_rewards/tag_count_reward/mean": -0.002197265625, + "eval_rewards/tag_count_reward/std": 0.019685723586007953, + "eval_runtime": 1580.0116, + "eval_samples_per_second": 0.158, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0408074893057346, + "eval_sampling/importance_sampling_ratio/min": 0.00181254785630403, + "eval_sampling/sampling_logp_difference/max": 151.67179602384567, + "eval_sampling/sampling_logp_difference/mean": 0.2478228227701038, + "eval_steps_per_second": 0.001, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3355.0, + "completions/mean_length": 1373.986328125, + "completions/mean_terminated_length": 1315.1236572265625, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.20005454123020172, + "epoch": 2.0016025641025643, + "frac_reward_zero_std": 0.15625, + "grad_norm": 233.30934143066406, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 893940272.0, + "reward": 0.36688268184661865, + "reward_std": 0.056143928319215775, + "rewards/progression_diversity/mean": -0.001186927082017064, + "rewards/progression_diversity/std": 0.01717980019748211, + "rewards/symbolic_reward_accuracy/mean": 0.240234375, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.7431640625, + "rewards/symbolic_reward_partial_score/std": 0.20910829305648804, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0366449356079102, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 2.187889575958252, + "step": 1249 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.19553004205226898, + "epoch": 2.003205128205128, + "grad_norm": 0.019861171022057533, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 1250 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.19362886995077133, + "epoch": 2.0048076923076925, + "grad_norm": 0.02055576629936695, + "learning_rate": 1e-06, + "loss": 0.0059, + "step": 1251 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.19335567951202393, + "epoch": 2.0064102564102564, + "grad_norm": 0.017372747883200645, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3457.0, + "completions/mean_length": 1377.3046875, + "completions/mean_terminated_length": 1347.9373779296875, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.1914682313799858, + "epoch": 2.0080128205128207, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.020828014239668846, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 895523628.0, + "reward": 0.28705087304115295, + "reward_std": 0.06678985804319382, + "rewards/progression_diversity/mean": -0.00047905254177749157, + "rewards/progression_diversity/std": 0.005628896411508322, + "rewards/symbolic_reward_accuracy/mean": 0.13671875, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.6840657591819763, + "rewards/symbolic_reward_partial_score/std": 0.20577622950077057, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0396616458892822, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.37476086616516113, + "step": 1253 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.19290538132190704, + "epoch": 2.0096153846153846, + "grad_norm": 1153.7135009765625, + "learning_rate": 1e-06, + "loss": 0.144, + "step": 1254 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.19067180156707764, + "epoch": 2.011217948717949, + "grad_norm": 0.014077689498662949, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 1255 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.19498515874147415, + "epoch": 2.0128205128205128, + "grad_norm": 0.017978975549340248, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3308.0, + "completions/mean_length": 1435.3828125, + "completions/mean_terminated_length": 1376.7608642578125, + "completions/min_length": 418.0, + "completions/min_terminated_length": 418.0, + "entropy": 0.19777005165815353, + "epoch": 2.014423076923077, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.028694914653897285, + "learning_rate": 1e-06, + "loss": 0.0114, + "num_tokens": 897164496.0, + "reward": 0.3483375608921051, + "reward_std": 0.07038730382919312, + "rewards/progression_diversity/mean": -0.0007183193229138851, + "rewards/progression_diversity/std": 0.0077925813384354115, + "rewards/symbolic_reward_accuracy/mean": 0.232421875, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.6976073980331421, + "rewards/symbolic_reward_partial_score/std": 0.23062647879123688, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0391186475753784, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.5552532076835632, + "step": 1257 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.19494935125112534, + "epoch": 2.016025641025641, + "grad_norm": 0.016217295080423355, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 1258 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.1985015571117401, + "epoch": 2.0176282051282053, + "grad_norm": 0.02569577842950821, + "learning_rate": 1e-06, + "loss": 0.0176, + "step": 1259 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.1980082392692566, + "epoch": 2.019230769230769, + "grad_norm": 0.01939314603805542, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3630.0, + "completions/mean_length": 1615.064453125, + "completions/mean_terminated_length": 1469.4141845703125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.18866483867168427, + "epoch": 2.0208333333333335, + "frac_reward_zero_std": 0.375, + "grad_norm": 20.04486656188965, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 898821809.0, + "reward": 0.36717134714126587, + "reward_std": 0.06693032383918762, + "rewards/progression_diversity/mean": -0.0011277215089648962, + "rewards/progression_diversity/std": 0.00911035481840372, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.7389160394668579, + "rewards/symbolic_reward_partial_score/std": 0.21004585921764374, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0375559329986572, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.6628260612487793, + "step": 1261 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.18701110780239105, + "epoch": 2.0224358974358974, + "grad_norm": 0.018055502325296402, + "learning_rate": 1e-06, + "loss": 0.02, + "step": 1262 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.18735942244529724, + "epoch": 2.0240384615384617, + "grad_norm": 0.02284780703485012, + "learning_rate": 1e-06, + "loss": 0.0494, + "step": 1263 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.1921609714627266, + "epoch": 2.0256410256410255, + "grad_norm": 0.03690061345696449, + "learning_rate": 1e-06, + "loss": 0.0227, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3370.0, + "completions/mean_length": 1423.083984375, + "completions/mean_terminated_length": 1364.413818359375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.1979808807373047, + "epoch": 2.02724358974359, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.024405112490057945, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 900394908.0, + "reward": 0.3067031800746918, + "reward_std": 0.04862578958272934, + "rewards/progression_diversity/mean": -9.318315278505906e-05, + "rewards/progression_diversity/std": 0.001644739182665944, + "rewards/symbolic_reward_accuracy/mean": 0.14453125, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.7345865964889526, + "rewards/symbolic_reward_partial_score/std": 0.20190957188606262, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0397248268127441, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.6828402876853943, + "step": 1265 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.19661108404397964, + "epoch": 2.0288461538461537, + "grad_norm": 0.013450069352984428, + "learning_rate": 1e-06, + "loss": 0.054, + "step": 1266 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.19856248795986176, + "epoch": 2.030448717948718, + "grad_norm": 0.017992835491895676, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1267 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.19879474490880966, + "epoch": 2.032051282051282, + "grad_norm": 0.019288472831249237, + "learning_rate": 1e-06, + "loss": -0.0147, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9273.0, + "completions/mean_length": 1691.78515625, + "completions/mean_terminated_length": 1546.8914794921875, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.19042234122753143, + "epoch": 2.0336538461538463, + "frac_reward_zero_std": 0.4375, + "grad_norm": 47.59299087524414, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 902148702.0, + "reward": 0.3654331862926483, + "reward_std": 0.061046987771987915, + "rewards/progression_diversity/mean": -0.0011158745037391782, + "rewards/progression_diversity/std": 0.014414801262319088, + "rewards/symbolic_reward_accuracy/mean": 0.232421875, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.7565592527389526, + "rewards/symbolic_reward_partial_score/std": 0.20248234272003174, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0370126962661743, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 1.0821824073791504, + "step": 1269 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.18952977657318115, + "epoch": 2.03525641025641, + "grad_norm": 0.0266401544213295, + "learning_rate": 1e-06, + "loss": 0.1359, + "step": 1270 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.18822447210550308, + "epoch": 2.0368589743589745, + "grad_norm": 0.027362341061234474, + "learning_rate": 1e-06, + "loss": 0.0679, + "step": 1271 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.18995289504528046, + "epoch": 2.0384615384615383, + "grad_norm": 0.014087039045989513, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3306.0, + "completions/mean_length": 1566.1796875, + "completions/mean_terminated_length": 1420.04736328125, + "completions/min_length": 428.0, + "completions/min_terminated_length": 428.0, + "entropy": 0.19420960545539856, + "epoch": 2.0400641025641026, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.01688203774392605, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 903831514.0, + "reward": 0.36713409423828125, + "reward_std": 0.03999203443527222, + "rewards/progression_diversity/mean": -0.00046001013834029436, + "rewards/progression_diversity/std": 0.004506079014390707, + "rewards/symbolic_reward_accuracy/mean": 0.240234375, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.74658203125, + "rewards/symbolic_reward_partial_score/std": 0.20671479403972626, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0370961427688599, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.4572553634643555, + "step": 1273 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.1954876109957695, + "epoch": 2.0416666666666665, + "grad_norm": 0.018446512520313263, + "learning_rate": 1e-06, + "loss": 0.0048, + "step": 1274 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.1945226937532425, + "epoch": 2.043269230769231, + "grad_norm": 324.0447082519531, + "learning_rate": 1e-06, + "loss": 0.0717, + "step": 1275 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.19547558575868607, + "epoch": 2.0448717948717947, + "grad_norm": 755.1400756835938, + "learning_rate": 1e-06, + "loss": 0.0474, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3276.0, + "completions/mean_length": 1617.3359375, + "completions/mean_terminated_length": 1353.1212158203125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.19297361373901367, + "epoch": 2.046474358974359, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.018968136981129646, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 905556918.0, + "reward": 0.2657776474952698, + "reward_std": 0.0448925644159317, + "rewards/progression_diversity/mean": -0.00036140886368229985, + "rewards/progression_diversity/std": 0.004768828861415386, + "rewards/symbolic_reward_accuracy/mean": 0.115234375, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.6613280773162842, + "rewards/symbolic_reward_partial_score/std": 0.21297021210193634, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0336802005767822, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 2.5898587703704834, + "step": 1277 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.18827137351036072, + "epoch": 2.048076923076923, + "grad_norm": 0.011503035202622414, + "learning_rate": 1e-06, + "loss": 0.0706, + "step": 1278 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.19312509894371033, + "epoch": 2.0496794871794872, + "grad_norm": 0.009355909191071987, + "learning_rate": 1e-06, + "loss": 0.0425, + "step": 1279 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.18875424563884735, + "epoch": 2.051282051282051, + "grad_norm": 0.016550183296203613, + "learning_rate": 1e-06, + "loss": 0.0553, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3837.0, + "completions/mean_length": 1660.9140625, + "completions/mean_terminated_length": 1456.831787109375, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.19483474642038345, + "epoch": 2.0528846153846154, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.02122480981051922, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 907271450.0, + "reward": 0.33715343475341797, + "reward_std": 0.034450314939022064, + "rewards/progression_diversity/mean": -0.00047867521061562, + "rewards/progression_diversity/std": 0.004741494078189135, + "rewards/symbolic_reward_accuracy/mean": 0.216796875, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.69482421875, + "rewards/symbolic_reward_partial_score/std": 0.21679222583770752, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0364654064178467, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.3800773620605469, + "step": 1281 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.1928672194480896, + "epoch": 2.0544871794871793, + "grad_norm": 1.5037897825241089, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 1282 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.1988951489329338, + "epoch": 2.0560897435897436, + "grad_norm": 0.024000803008675575, + "learning_rate": 1e-06, + "loss": 0.0196, + "step": 1283 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.19203483313322067, + "epoch": 2.0576923076923075, + "grad_norm": 0.019834544509649277, + "learning_rate": 1e-06, + "loss": 0.0836, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5436.0, + "completions/mean_length": 1464.345703125, + "completions/mean_terminated_length": 1405.8372802734375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 0.20045289397239685, + "epoch": 2.059294871794872, + "frac_reward_zero_std": 0.40625, + "grad_norm": 28.706775665283203, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 908900203.0, + "reward": 0.30004429817199707, + "reward_std": 0.05181866139173508, + "rewards/progression_diversity/mean": -0.0009444843744859099, + "rewards/progression_diversity/std": 0.011486358940601349, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.6928874254226685, + "rewards/symbolic_reward_partial_score/std": 0.2050057202577591, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0416698455810547, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.41028910875320435, + "step": 1285 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2051420360803604, + "epoch": 2.0608974358974357, + "grad_norm": 0.01245288085192442, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 1286 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2012268602848053, + "epoch": 2.0625, + "grad_norm": 0.025453390553593636, + "learning_rate": 1e-06, + "loss": 0.0173, + "step": 1287 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.20480038225650787, + "epoch": 2.064102564102564, + "grad_norm": 0.025336798280477524, + "learning_rate": 1e-06, + "loss": 0.0103, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3935.0, + "completions/mean_length": 1418.521484375, + "completions/mean_terminated_length": 1389.23486328125, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.2055332362651825, + "epoch": 2.065705128205128, + "frac_reward_zero_std": 0.5, + "grad_norm": 157.3429412841797, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 910496934.0, + "reward": 0.26742783188819885, + "reward_std": 0.019551731646060944, + "rewards/progression_diversity/mean": -0.0008687502122484148, + "rewards/progression_diversity/std": 0.010150086134672165, + "rewards/symbolic_reward_accuracy/mean": 0.09375, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.7046061158180237, + "rewards/symbolic_reward_partial_score/std": 0.19600564241409302, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0424489974975586, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.4152454733848572, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2030770629644394, + "epoch": 2.0673076923076925, + "grad_norm": 0.015944253653287888, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.20718741416931152, + "epoch": 2.0689102564102564, + "grad_norm": 0.014883228577673435, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 1291 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.20744304358959198, + "epoch": 2.0705128205128207, + "grad_norm": 0.014635228551924229, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3403.0, + "completions/mean_length": 1773.353515625, + "completions/mean_terminated_length": 1600.1048583984375, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.18955910950899124, + "epoch": 2.0721153846153846, + "frac_reward_zero_std": 0.375, + "grad_norm": 1073.1669921875, + "learning_rate": 1e-06, + "loss": 0.058, + "num_tokens": 912368443.0, + "reward": 0.2491791993379593, + "reward_std": 0.03544043377041817, + "rewards/progression_diversity/mean": -0.0005382450763136148, + "rewards/progression_diversity/std": 0.008408893831074238, + "rewards/symbolic_reward_accuracy/mean": 0.08203125, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.670458972454071, + "rewards/symbolic_reward_partial_score/std": 0.19899418950080872, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.036078691482544, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.4867057800292969, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.19430772960186005, + "epoch": 2.073717948717949, + "grad_norm": 0.013100269250571728, + "learning_rate": 1e-06, + "loss": -0.0115, + "step": 1294 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.19107140600681305, + "epoch": 2.0753205128205128, + "grad_norm": 0.012087260372936726, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 1295 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.1892467811703682, + "epoch": 2.076923076923077, + "grad_norm": 0.01232385728508234, + "learning_rate": 1e-06, + "loss": 0.0455, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3224.0, + "completions/mean_length": 1561.599609375, + "completions/mean_terminated_length": 1444.8878173828125, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.1971185952425003, + "epoch": 2.078525641025641, + "frac_reward_zero_std": 0.34375, + "grad_norm": 33.13689422607422, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 914118782.0, + "reward": 0.27982568740844727, + "reward_std": 0.04554954543709755, + "rewards/progression_diversity/mean": -0.00034400858567096293, + "rewards/progression_diversity/std": 0.0038654166273772717, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.6931803226470947, + "rewards/symbolic_reward_partial_score/std": 0.18553432822227478, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.037705659866333, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 732.0, + "sampling/sampling_logp_difference/mean": 1.3978204727172852, + "step": 1297 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.1944933459162712, + "epoch": 2.0801282051282053, + "grad_norm": 0.015022831037640572, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1298 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.19650224596261978, + "epoch": 2.081730769230769, + "grad_norm": 0.022843046113848686, + "learning_rate": 1e-06, + "loss": -0.0049, + "step": 1299 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.19136834889650345, + "epoch": 2.0833333333333335, + "grad_norm": 0.02248123101890087, + "learning_rate": 1e-06, + "loss": 0.0554, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3264.0, + "completions/mean_length": 1438.8671875, + "completions/mean_terminated_length": 1409.620361328125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.2071860283613205, + "epoch": 2.0849358974358974, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.022805871441960335, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 915655274.0, + "reward": 0.4092583954334259, + "reward_std": 0.04942026361823082, + "rewards/progression_diversity/mean": -0.0004308145144023001, + "rewards/progression_diversity/std": 0.00391729548573494, + "rewards/symbolic_reward_accuracy/mean": 0.302734375, + "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, + "rewards/symbolic_reward_partial_score/mean": 0.7593912482261658, + "rewards/symbolic_reward_partial_score/std": 0.2201661616563797, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0432233810424805, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.34774258732795715, + "step": 1301 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.20583676546812057, + "epoch": 2.0865384615384617, + "grad_norm": 0.010654095560312271, + "learning_rate": 1e-06, + "loss": -0.0036, + "step": 1302 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.21007215976715088, + "epoch": 2.0881410256410255, + "grad_norm": 0.018026195466518402, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 1303 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.20896370708942413, + "epoch": 2.08974358974359, + "grad_norm": 0.018298866227269173, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2887.0, + "completions/mean_length": 1537.986328125, + "completions/mean_terminated_length": 1450.4853515625, + "completions/min_length": 502.0, + "completions/min_terminated_length": 502.0, + "entropy": 0.19623306393623352, + "epoch": 2.0913461538461537, + "frac_reward_zero_std": 0.25, + "grad_norm": 455.7239685058594, + "learning_rate": 1e-06, + "loss": 0.0324, + "num_tokens": 917384147.0, + "reward": 0.27777591347694397, + "reward_std": 0.05723512917757034, + "rewards/progression_diversity/mean": -0.00024088792270049453, + "rewards/progression_diversity/std": 0.0034442872274667025, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.6778808832168579, + "rewards/symbolic_reward_partial_score/std": 0.21165545284748077, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.040327787399292, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.7170519232749939, + "step": 1305 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.20050545781850815, + "epoch": 2.092948717948718, + "grad_norm": 0.024372782558202744, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 1306 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.20225122570991516, + "epoch": 2.094551282051282, + "grad_norm": 0.017451845109462738, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 1307 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.19906309247016907, + "epoch": 2.0961538461538463, + "grad_norm": 0.015806974843144417, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3072.0, + "completions/max_terminated_length": 3072.0, + "completions/mean_length": 1569.970703125, + "completions/mean_terminated_length": 1569.970703125, + "completions/min_length": 474.0, + "completions/min_terminated_length": 474.0, + "entropy": 0.2078276202082634, + "epoch": 2.09775641025641, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.030513443052768707, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 918996132.0, + "reward": 0.3342166543006897, + "reward_std": 0.05483569577336311, + "rewards/progression_diversity/mean": -0.00021007427130825818, + "rewards/progression_diversity/std": 0.001884151715785265, + "rewards/symbolic_reward_accuracy/mean": 0.203125, + "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, + "rewards/symbolic_reward_partial_score/mean": 0.7078125476837158, + "rewards/symbolic_reward_partial_score/std": 0.2104804515838623, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0436408519744873, + "sampling/importance_sampling_ratio/min": 1.435761191714846e-06, + "sampling/sampling_logp_difference/max": 13.453815460205078, + "sampling/sampling_logp_difference/mean": 0.09233014285564423, + "step": 1309 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.20950071513652802, + "epoch": 2.0993589743589745, + "grad_norm": 0.01937035657465458, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 1310 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21072106808423996, + "epoch": 2.1009615384615383, + "grad_norm": 0.019583554938435555, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 1311 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.20755978673696518, + "epoch": 2.1025641025641026, + "grad_norm": 0.017628004774451256, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2935.0, + "completions/max_terminated_length": 2935.0, + "completions/mean_length": 1462.236328125, + "completions/mean_terminated_length": 1462.236328125, + "completions/min_length": 465.0, + "completions/min_terminated_length": 465.0, + "entropy": 0.20946191251277924, + "epoch": 2.1041666666666665, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.016548819839954376, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 920573229.0, + "reward": 0.3093579113483429, + "reward_std": 0.030774272978305817, + "rewards/progression_diversity/mean": -0.0002441601827740669, + "rewards/progression_diversity/std": 0.0027843969874083996, + "rewards/symbolic_reward_accuracy/mean": 0.146484375, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.7382323741912842, + "rewards/symbolic_reward_partial_score/std": 0.1788843870162964, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.044257640838623, + "sampling/importance_sampling_ratio/min": 0.00013327680062502623, + "sampling/sampling_logp_difference/max": 8.92308235168457, + "sampling/sampling_logp_difference/mean": 0.09280422329902649, + "step": 1313 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2082810178399086, + "epoch": 2.105769230769231, + "grad_norm": 0.018022574484348297, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 1314 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.20982997119426727, + "epoch": 2.1073717948717947, + "grad_norm": 0.01984243281185627, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 1315 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.21134591847658157, + "epoch": 2.108974358974359, + "grad_norm": 0.025971846655011177, + "learning_rate": 1e-06, + "loss": 0.0064, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3383.0, + "completions/mean_length": 1503.8203125, + "completions/mean_terminated_length": 1474.7005615234375, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "entropy": 0.20815780013799667, + "epoch": 2.110576923076923, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.027938535436987877, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 922209697.0, + "reward": 0.3024898171424866, + "reward_std": 0.08157630264759064, + "rewards/progression_diversity/mean": -0.0005339247290976346, + "rewards/progression_diversity/std": 0.009711179882287979, + "rewards/symbolic_reward_accuracy/mean": 0.162109375, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.6847493648529053, + "rewards/symbolic_reward_partial_score/std": 0.21822915971279144, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0429041385650635, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.4195671081542969, + "step": 1317 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21150559186935425, + "epoch": 2.1121794871794872, + "grad_norm": 0.0272557120770216, + "learning_rate": 1e-06, + "loss": -0.0185, + "step": 1318 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.20635531842708588, + "epoch": 2.113782051282051, + "grad_norm": 0.0219330545514822, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 1319 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.20883511006832123, + "epoch": 2.1153846153846154, + "grad_norm": 344.6760559082031, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3218.0, + "completions/mean_length": 1496.935546875, + "completions/mean_terminated_length": 1467.8023681640625, + "completions/min_length": 487.0, + "completions/min_terminated_length": 487.0, + "entropy": 0.20407634973526, + "epoch": 2.1169871794871793, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.02108251117169857, + "learning_rate": 1e-06, + "loss": 0.0286, + "num_tokens": 923895312.0, + "reward": 0.36836761236190796, + "reward_std": 0.025770656764507294, + "rewards/progression_diversity/mean": -0.0001522178645245731, + "rewards/progression_diversity/std": 0.0016012955456972122, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.7357096076011658, + "rewards/symbolic_reward_partial_score/std": 0.22949159145355225, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0432062149047852, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 728.0, + "sampling/sampling_logp_difference/mean": 0.1866808831691742, + "step": 1321 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.20261122286319733, + "epoch": 2.1185897435897436, + "grad_norm": 0.024469273164868355, + "learning_rate": 1e-06, + "loss": 0.006, + "step": 1322 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.20310993492603302, + "epoch": 2.1201923076923075, + "grad_norm": 0.023724382743239403, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 1323 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.20938430726528168, + "epoch": 2.121794871794872, + "grad_norm": 0.012969693168997765, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2685.0, + "completions/max_terminated_length": 2685.0, + "completions/mean_length": 1410.677734375, + "completions/mean_terminated_length": 1410.677734375, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.2097427099943161, + "epoch": 2.123397435897436, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.01685580424964428, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 925466827.0, + "reward": 0.38571590185165405, + "reward_std": 0.03916021063923836, + "rewards/progression_diversity/mean": -0.00018581100448500365, + "rewards/progression_diversity/std": 0.002618141006678343, + "rewards/symbolic_reward_accuracy/mean": 0.255859375, + "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, + "rewards/symbolic_reward_partial_score/mean": 0.7740071415901184, + "rewards/symbolic_reward_partial_score/std": 0.18180802464485168, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0450756549835205, + "sampling/importance_sampling_ratio/min": 1.69496820490167e-06, + "sampling/sampling_logp_difference/max": 13.287846565246582, + "sampling/sampling_logp_difference/mean": 0.0947427749633789, + "step": 1325 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2110649198293686, + "epoch": 2.125, + "grad_norm": 0.019374966621398926, + "learning_rate": 1e-06, + "loss": 0.0141, + "step": 1326 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.20897484570741653, + "epoch": 2.126602564102564, + "grad_norm": 0.011842702515423298, + "learning_rate": 1e-06, + "loss": -0.0107, + "step": 1327 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21269075572490692, + "epoch": 2.128205128205128, + "grad_norm": 0.01617637649178505, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2945.0, + "completions/mean_length": 1554.6484375, + "completions/mean_terminated_length": 1467.24560546875, + "completions/min_length": 448.0, + "completions/min_terminated_length": 448.0, + "entropy": 0.2093176394701004, + "epoch": 2.1298076923076925, + "frac_reward_zero_std": 0.59375, + "grad_norm": 3.476539134979248, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 927113287.0, + "reward": 0.372197687625885, + "reward_std": 0.024160441011190414, + "rewards/progression_diversity/mean": -0.0009393933578394353, + "rewards/progression_diversity/std": 0.014015908353030682, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.7543619871139526, + "rewards/symbolic_reward_partial_score/std": 0.20396019518375397, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041762351989746, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.016873836517334, + "step": 1329 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.21161337196826935, + "epoch": 2.1314102564102564, + "grad_norm": 0.009917390532791615, + "learning_rate": 1e-06, + "loss": 0.025, + "step": 1330 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.21297961473464966, + "epoch": 2.1330128205128207, + "grad_norm": 0.008365034125745296, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 1331 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.21263594925403595, + "epoch": 2.1346153846153846, + "grad_norm": 0.020000530406832695, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2631.0, + "completions/max_terminated_length": 2631.0, + "completions/mean_length": 1392.521484375, + "completions/mean_terminated_length": 1392.521484375, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.21844053268432617, + "epoch": 2.136217948717949, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02576032653450966, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 928662738.0, + "reward": 0.39745038747787476, + "reward_std": 0.04422683268785477, + "rewards/progression_diversity/mean": -8.176280243787915e-05, + "rewards/progression_diversity/std": 0.001850081025622785, + "rewards/symbolic_reward_accuracy/mean": 0.2890625, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.7467122673988342, + "rewards/symbolic_reward_partial_score/std": 0.21178719401359558, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0465729236602783, + "sampling/importance_sampling_ratio/min": 9.539062739349902e-05, + "sampling/sampling_logp_difference/max": 9.257530212402344, + "sampling/sampling_logp_difference/mean": 0.09742757678031921, + "step": 1333 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.22133174538612366, + "epoch": 2.1378205128205128, + "grad_norm": 0.012168043293058872, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1334 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.22107722610235214, + "epoch": 2.139423076923077, + "grad_norm": 0.010279431007802486, + "learning_rate": 1e-06, + "loss": -0.0065, + "step": 1335 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21761613339185715, + "epoch": 2.141025641025641, + "grad_norm": 0.008288303390145302, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2819.0, + "completions/mean_length": 1633.904296875, + "completions/mean_terminated_length": 1605.0391845703125, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.20767099410295486, + "epoch": 2.1426282051282053, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.021908177062869072, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 930332929.0, + "reward": 0.27803462743759155, + "reward_std": 0.053543590009212494, + "rewards/progression_diversity/mean": -0.0002496470115147531, + "rewards/progression_diversity/std": 0.0035664818715304136, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.661816418170929, + "rewards/symbolic_reward_partial_score/std": 0.1937265247106552, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0444786548614502, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.3441740572452545, + "step": 1337 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.20627369731664658, + "epoch": 2.144230769230769, + "grad_norm": 0.026881849393248558, + "learning_rate": 1e-06, + "loss": 0.0346, + "step": 1338 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21029096841812134, + "epoch": 2.1458333333333335, + "grad_norm": 0.015624100342392921, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 1339 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.21201331168413162, + "epoch": 2.1474358974358974, + "grad_norm": 0.023685337975621223, + "learning_rate": 1e-06, + "loss": -0.0102, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2974.0, + "completions/max_terminated_length": 2974.0, + "completions/mean_length": 1587.693359375, + "completions/mean_terminated_length": 1587.693359375, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.20493042469024658, + "epoch": 2.1490384615384617, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.022984595969319344, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 931986948.0, + "reward": 0.4041173458099365, + "reward_std": 0.039462070912122726, + "rewards/progression_diversity/mean": -0.0008660133462399244, + "rewards/progression_diversity/std": 0.006552197504788637, + "rewards/symbolic_reward_accuracy/mean": 0.296875, + "rewards/symbolic_reward_accuracy/std": 0.45732781291007996, + "rewards/symbolic_reward_partial_score/mean": 0.7533365488052368, + "rewards/symbolic_reward_partial_score/std": 0.21289843320846558, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0439233779907227, + "sampling/importance_sampling_ratio/min": 0.0010176009964197874, + "sampling/sampling_logp_difference/max": 6.890307426452637, + "sampling/sampling_logp_difference/mean": 0.0925874412059784, + "step": 1341 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.20170185714960098, + "epoch": 2.1506410256410255, + "grad_norm": 0.022239655256271362, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 1342 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2027420625090599, + "epoch": 2.15224358974359, + "grad_norm": 0.01715942844748497, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 1343 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.20686108618974686, + "epoch": 2.1538461538461537, + "grad_norm": 0.010147986933588982, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3213.0, + "completions/max_terminated_length": 3213.0, + "completions/mean_length": 1698.078125, + "completions/mean_terminated_length": 1698.078125, + "completions/min_length": 559.0, + "completions/min_terminated_length": 559.0, + "entropy": 0.2128048539161682, + "epoch": 2.155448717948718, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.016743527725338936, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 933690460.0, + "reward": 0.3048563003540039, + "reward_std": 0.04834011197090149, + "rewards/progression_diversity/mean": -0.0007014497532509267, + "rewards/progression_diversity/std": 0.00812255684286356, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.7115234136581421, + "rewards/symbolic_reward_partial_score/std": 0.18424317240715027, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0451080799102783, + "sampling/importance_sampling_ratio/min": 6.933895201655105e-05, + "sampling/sampling_logp_difference/max": 9.57650375366211, + "sampling/sampling_logp_difference/mean": 0.09451892971992493, + "step": 1345 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.21316298097372055, + "epoch": 2.157051282051282, + "grad_norm": 0.013251153752207756, + "learning_rate": 1e-06, + "loss": 0.0005, + "step": 1346 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.20967798680067062, + "epoch": 2.1586538461538463, + "grad_norm": 0.015155055560171604, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 1347 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2121565341949463, + "epoch": 2.16025641025641, + "grad_norm": 0.024950312450528145, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3127.0, + "completions/mean_length": 1755.244140625, + "completions/mean_terminated_length": 1697.8765869140625, + "completions/min_length": 394.0, + "completions/min_terminated_length": 394.0, + "entropy": 0.20723742246627808, + "epoch": 2.1618589743589745, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02118254266679287, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 935479065.0, + "reward": 0.4168354868888855, + "reward_std": 0.05675492808222771, + "rewards/progression_diversity/mean": -0.000535505183506757, + "rewards/progression_diversity/std": 0.0042114765383303165, + "rewards/symbolic_reward_accuracy/mean": 0.32421875, + "rewards/symbolic_reward_accuracy/std": 0.4685399830341339, + "rewards/symbolic_reward_partial_score/mean": 0.7423340082168579, + "rewards/symbolic_reward_partial_score/std": 0.23071099817752838, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0427265167236328, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.5817426443099976, + "step": 1349 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.20122719556093216, + "epoch": 2.1634615384615383, + "grad_norm": 0.01896592602133751, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 1350 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.20486315339803696, + "epoch": 2.1650641025641026, + "grad_norm": 0.01839040406048298, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 1351 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.20524092018604279, + "epoch": 2.1666666666666665, + "grad_norm": 0.013213125988841057, + "learning_rate": 1e-06, + "loss": -0.0134, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3402.0, + "completions/mean_length": 1791.140625, + "completions/mean_terminated_length": 1705.1317138671875, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.1993129849433899, + "epoch": 2.168269230769231, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.018554260954260826, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 937293713.0, + "reward": 0.3082357347011566, + "reward_std": 0.05308583378791809, + "rewards/progression_diversity/mean": -0.0016233250498771667, + "rewards/progression_diversity/std": 0.010440926998853683, + "rewards/symbolic_reward_accuracy/mean": 0.1640625, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.70068359375, + "rewards/symbolic_reward_partial_score/std": 0.1991189867258072, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0414732694625854, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.7726681232452393, + "step": 1353 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2003878429532051, + "epoch": 2.1698717948717947, + "grad_norm": 0.024082843214273453, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 1354 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.20279623568058014, + "epoch": 2.171474358974359, + "grad_norm": 0.02481573261320591, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 1355 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.20432738959789276, + "epoch": 2.173076923076923, + "grad_norm": 0.023944402113556862, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4818.0, + "completions/mean_length": 1645.9921875, + "completions/mean_terminated_length": 1617.150634765625, + "completions/min_length": 515.0, + "completions/min_terminated_length": 515.0, + "entropy": 0.20042243599891663, + "epoch": 2.1746794871794872, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.025058995932340622, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 939053501.0, + "reward": 0.3049207925796509, + "reward_std": 0.029460538178682327, + "rewards/progression_diversity/mean": -0.0020646725315600634, + "rewards/progression_diversity/std": 0.010834318585693836, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.7085286378860474, + "rewards/symbolic_reward_partial_score/std": 0.19462068378925323, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0425053834915161, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.37059780955314636, + "step": 1357 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.19977743178606033, + "epoch": 2.176282051282051, + "grad_norm": 0.01753915846347809, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 1358 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.19901012629270554, + "epoch": 2.1778846153846154, + "grad_norm": 0.020039677619934082, + "learning_rate": 1e-06, + "loss": 0.0348, + "step": 1359 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.1974909007549286, + "epoch": 2.1794871794871793, + "grad_norm": 0.016383672133088112, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3294.0, + "completions/mean_length": 1614.376953125, + "completions/mean_terminated_length": 1556.4569091796875, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.2029455080628395, + "epoch": 2.1810897435897436, + "frac_reward_zero_std": 0.125, + "grad_norm": 80.58993530273438, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 940647406.0, + "reward": 0.41535690426826477, + "reward_std": 0.07679130136966705, + "rewards/progression_diversity/mean": -0.0019082010257989168, + "rewards/progression_diversity/std": 0.01008455641567707, + "rewards/symbolic_reward_accuracy/mean": 0.310546875, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.7647949457168579, + "rewards/symbolic_reward_partial_score/std": 0.22581757605075836, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.042673110961914, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.7193973660469055, + "step": 1361 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.203001469373703, + "epoch": 2.1826923076923075, + "grad_norm": 0.021764498203992844, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 1362 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2056587189435959, + "epoch": 2.184294871794872, + "grad_norm": 0.020050769671797752, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 1363 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2047322392463684, + "epoch": 2.185897435897436, + "grad_norm": 0.02470891922712326, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3347.0, + "completions/mean_length": 1624.029296875, + "completions/mean_terminated_length": 1537.035400390625, + "completions/min_length": 432.0, + "completions/min_terminated_length": 432.0, + "entropy": 0.19499259442090988, + "epoch": 2.1875, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.025656770914793015, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 942358845.0, + "reward": 0.33442234992980957, + "reward_std": 0.039018601179122925, + "rewards/progression_diversity/mean": -0.0016129279974848032, + "rewards/progression_diversity/std": 0.011494509875774384, + "rewards/symbolic_reward_accuracy/mean": 0.1953125, + "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, + "rewards/symbolic_reward_partial_score/mean": 0.7261229753494263, + "rewards/symbolic_reward_partial_score/std": 0.2054896205663681, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0397260189056396, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.8769243955612183, + "step": 1365 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.19767170399427414, + "epoch": 2.189102564102564, + "grad_norm": 0.015550960786640644, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 1366 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.19075892865657806, + "epoch": 2.190705128205128, + "grad_norm": 0.024761386215686798, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1367 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.1886235773563385, + "epoch": 2.1923076923076925, + "grad_norm": 0.015679223462939262, + "learning_rate": 1e-06, + "loss": 0.0347, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3384.0, + "completions/mean_length": 1550.146484375, + "completions/mean_terminated_length": 1521.117431640625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.1886545717716217, + "epoch": 2.1939102564102564, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.02213294804096222, + "learning_rate": 1e-06, + "loss": 0.025, + "num_tokens": 944050936.0, + "reward": 0.3354012668132782, + "reward_std": 0.0484355203807354, + "rewards/progression_diversity/mean": -0.0004021337954327464, + "rewards/progression_diversity/std": 0.0033471970818936825, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.6967936158180237, + "rewards/symbolic_reward_partial_score/std": 0.24216388165950775, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0405962467193604, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.31618547439575195, + "step": 1369 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.19220811873674393, + "epoch": 2.1955128205128207, + "grad_norm": 0.017105696722865105, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 1370 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.18837008625268936, + "epoch": 2.1971153846153846, + "grad_norm": 0.020541273057460785, + "learning_rate": 1e-06, + "loss": -0.0059, + "step": 1371 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.19123337417840958, + "epoch": 2.198717948717949, + "grad_norm": 0.016050660982728004, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3055.0, + "completions/mean_length": 1501.89453125, + "completions/mean_terminated_length": 1443.533447265625, + "completions/min_length": 529.0, + "completions/min_terminated_length": 529.0, + "entropy": 0.1963193044066429, + "epoch": 2.2003205128205128, + "frac_reward_zero_std": 0.1875, + "grad_norm": 839.10888671875, + "learning_rate": 1e-06, + "loss": 0.0385, + "num_tokens": 945680274.0, + "reward": 0.3143922984600067, + "reward_std": 0.04647096246480942, + "rewards/progression_diversity/mean": -0.0016889558173716068, + "rewards/progression_diversity/std": 0.009619291871786118, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.6782389283180237, + "rewards/symbolic_reward_partial_score/std": 0.20476947724819183, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0405044555664062, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.8936862349510193, + "step": 1373 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.19663988798856735, + "epoch": 2.201923076923077, + "grad_norm": 0.023362474516034126, + "learning_rate": 1e-06, + "loss": 0.0048, + "step": 1374 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.19550593197345734, + "epoch": 2.203525641025641, + "grad_norm": 0.01925286464393139, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 1375 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.198850616812706, + "epoch": 2.2051282051282053, + "grad_norm": 0.012360334396362305, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10325.0, + "completions/mean_length": 1557.294921875, + "completions/mean_terminated_length": 1499.151123046875, + "completions/min_length": 446.0, + "completions/min_terminated_length": 446.0, + "entropy": 0.1966118887066841, + "epoch": 2.206730769230769, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.030109280720353127, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 947367161.0, + "reward": 0.2886817753314972, + "reward_std": 0.03847378119826317, + "rewards/progression_diversity/mean": -0.0009640345815569162, + "rewards/progression_diversity/std": 0.006367819383740425, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.6979818344116211, + "rewards/symbolic_reward_partial_score/std": 0.1839800328016281, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0392873287200928, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.0182666778564453, + "step": 1377 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.1928621083498001, + "epoch": 2.2083333333333335, + "grad_norm": 0.01684403605759144, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 1378 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.19299018383026123, + "epoch": 2.2099358974358974, + "grad_norm": 0.013669331558048725, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 1379 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.19235268235206604, + "epoch": 2.2115384615384617, + "grad_norm": 0.013778350315988064, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3313.0, + "completions/mean_length": 1556.845703125, + "completions/mean_terminated_length": 1527.8297119140625, + "completions/min_length": 408.0, + "completions/min_terminated_length": 408.0, + "entropy": 0.19688106328248978, + "epoch": 2.2131410256410255, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.03251493349671364, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 949020410.0, + "reward": 0.26278871297836304, + "reward_std": 0.05675513297319412, + "rewards/progression_diversity/mean": -0.00042738509364426136, + "rewards/progression_diversity/std": 0.004007916897535324, + "rewards/symbolic_reward_accuracy/mean": 0.111328125, + "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, + "rewards/symbolic_reward_partial_score/mean": 0.6539713144302368, + "rewards/symbolic_reward_partial_score/std": 0.2113257795572281, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0415072441101074, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.9642093777656555, + "step": 1381 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2019263282418251, + "epoch": 2.21474358974359, + "grad_norm": 0.013695978559553623, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 1382 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.20185761898756027, + "epoch": 2.2163461538461537, + "grad_norm": 0.016754373908042908, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 1383 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.20480705797672272, + "epoch": 2.217948717948718, + "grad_norm": 0.013411487452685833, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3598.0, + "completions/mean_length": 1521.16015625, + "completions/mean_terminated_length": 1433.5599365234375, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "entropy": 0.21776793152093887, + "epoch": 2.219551282051282, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.022128723561763763, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 950676124.0, + "reward": 0.3249192237854004, + "reward_std": 0.05313926190137863, + "rewards/progression_diversity/mean": -0.0017315060831606388, + "rewards/progression_diversity/std": 0.009827325120568275, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.7178873419761658, + "rewards/symbolic_reward_partial_score/std": 0.20977161824703217, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0407921075820923, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 2.0792016983032227, + "step": 1385 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.20703934133052826, + "epoch": 2.2211538461538463, + "grad_norm": 0.028047246858477592, + "learning_rate": 1e-06, + "loss": 0.0584, + "step": 1386 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.21588321775197983, + "epoch": 2.22275641025641, + "grad_norm": 0.017534522339701653, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1387 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2086542472243309, + "epoch": 2.2243589743589745, + "grad_norm": 0.015552827157080173, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3854.0, + "completions/mean_length": 1460.521484375, + "completions/mean_terminated_length": 1401.9981689453125, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "entropy": 0.2114865928888321, + "epoch": 2.2259615384615383, + "frac_reward_zero_std": 0.34375, + "grad_norm": 393.90008544921875, + "learning_rate": 1e-06, + "loss": 0.0184, + "num_tokens": 952301287.0, + "reward": 0.3778611421585083, + "reward_std": 0.04598058760166168, + "rewards/progression_diversity/mean": -0.000995249254629016, + "rewards/progression_diversity/std": 0.006696970667690039, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.7686848640441895, + "rewards/symbolic_reward_partial_score/std": 0.1856938898563385, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0438477993011475, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.968491792678833, + "step": 1389 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22220055013895035, + "epoch": 2.2275641025641026, + "grad_norm": 0.007562046870589256, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1390 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.21346572041511536, + "epoch": 2.2291666666666665, + "grad_norm": 0.02975591830909252, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 1391 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.20686539262533188, + "epoch": 2.230769230769231, + "grad_norm": 0.023594049736857414, + "learning_rate": 1e-06, + "loss": 0.0378, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2763.0, + "completions/mean_length": 1543.576171875, + "completions/mean_terminated_length": 1456.108154296875, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "entropy": 0.21457184851169586, + "epoch": 2.2323717948717947, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.024636728689074516, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 953897950.0, + "reward": 0.3451923131942749, + "reward_std": 0.03799974173307419, + "rewards/progression_diversity/mean": -0.0012781170662492514, + "rewards/progression_diversity/std": 0.014912668615579605, + "rewards/symbolic_reward_accuracy/mean": 0.205078125, + "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, + "rewards/symbolic_reward_partial_score/mean": 0.7424805164337158, + "rewards/symbolic_reward_partial_score/std": 0.19341571629047394, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0439865589141846, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.7786634564399719, + "step": 1393 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.20991724729537964, + "epoch": 2.233974358974359, + "grad_norm": 0.018171893432736397, + "learning_rate": 1e-06, + "loss": 0.0823, + "step": 1394 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.21980993449687958, + "epoch": 2.235576923076923, + "grad_norm": 0.018304111436009407, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21467426419258118, + "epoch": 2.2371794871794872, + "grad_norm": 0.011014263145625591, + "learning_rate": 1e-06, + "loss": -0.0085, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11943.0, + "completions/mean_length": 1486.80859375, + "completions/mean_terminated_length": 1428.3883056640625, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "entropy": 0.20776310563087463, + "epoch": 2.238782051282051, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.04726417362689972, + "learning_rate": 1e-06, + "loss": 0.0351, + "num_tokens": 955540620.0, + "reward": 0.279166579246521, + "reward_std": 0.017867092043161392, + "rewards/progression_diversity/mean": -0.001313370536081493, + "rewards/progression_diversity/std": 0.011600782163441181, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.6819010972976685, + "rewards/symbolic_reward_partial_score/std": 0.2057180553674698, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0435209274291992, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.23488587141036987, + "step": 1397 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.20805874466896057, + "epoch": 2.2403846153846154, + "grad_norm": 0.020725863054394722, + "learning_rate": 1e-06, + "loss": -0.0034, + "step": 1398 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2029411420226097, + "epoch": 2.2419871794871793, + "grad_norm": 0.008639580570161343, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2050759717822075, + "epoch": 2.2435897435897436, + "grad_norm": 0.011195927858352661, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3068.0, + "completions/mean_length": 1376.677734375, + "completions/mean_terminated_length": 1347.3092041015625, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "entropy": 0.21754567325115204, + "epoch": 2.2451923076923075, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.01782279461622238, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 957101943.0, + "reward": 0.3711217939853668, + "reward_std": 0.0564521960914135, + "rewards/progression_diversity/mean": -0.0006168890977278352, + "rewards/progression_diversity/std": 0.005630532745271921, + "rewards/symbolic_reward_accuracy/mean": 0.271484375, + "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, + "rewards/symbolic_reward_partial_score/mean": 0.6947753429412842, + "rewards/symbolic_reward_partial_score/std": 0.22683373093605042, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0451213121414185, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.8511496782302856, + "step": 1401 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.21188554912805557, + "epoch": 2.246794871794872, + "grad_norm": 0.01593591831624508, + "learning_rate": 1e-06, + "loss": 0.0121, + "step": 1402 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21926379948854446, + "epoch": 2.248397435897436, + "grad_norm": 0.013704286888241768, + "learning_rate": 1e-06, + "loss": 0.0011, + "step": 1403 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.21662156283855438, + "epoch": 2.25, + "grad_norm": 0.009653382934629917, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3479.0, + "completions/mean_length": 1432.54296875, + "completions/mean_terminated_length": 1403.28369140625, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.21846728771924973, + "epoch": 2.251602564102564, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.02795334719121456, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 958673405.0, + "reward": 0.28600820899009705, + "reward_std": 0.04315223544836044, + "rewards/progression_diversity/mean": -0.0007437419844791293, + "rewards/progression_diversity/std": 0.008263318799436092, + "rewards/symbolic_reward_accuracy/mean": 0.130859375, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.6923177242279053, + "rewards/symbolic_reward_partial_score/std": 0.18848715722560883, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0477138757705688, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 716.0, + "sampling/sampling_logp_difference/mean": 0.16357702016830444, + "step": 1405 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.22100605070590973, + "epoch": 2.253205128205128, + "grad_norm": 0.02042258158326149, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1406 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2175752967596054, + "epoch": 2.2548076923076925, + "grad_norm": 0.013299060985445976, + "learning_rate": 1e-06, + "loss": 0.0054, + "step": 1407 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2246379554271698, + "epoch": 2.2564102564102564, + "grad_norm": 0.01301884651184082, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6158.0, + "completions/mean_length": 1540.216796875, + "completions/mean_terminated_length": 1482.0059814453125, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.22940433025360107, + "epoch": 2.2580128205128207, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.01926892064511776, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 960228028.0, + "reward": 0.3234546184539795, + "reward_std": 0.027436789125204086, + "rewards/progression_diversity/mean": -0.0012210736749693751, + "rewards/progression_diversity/std": 0.010683650150895119, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.7084310054779053, + "rewards/symbolic_reward_partial_score/std": 0.20990176498889923, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0487642288208008, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.48923975229263306, + "step": 1409 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.23073109239339828, + "epoch": 2.2596153846153846, + "grad_norm": 0.005647487938404083, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 1410 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2332421839237213, + "epoch": 2.261217948717949, + "grad_norm": 0.02655133046209812, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 1411 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22827783972024918, + "epoch": 2.2628205128205128, + "grad_norm": 0.013446721248328686, + "learning_rate": 1e-06, + "loss": 0.0165, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4043.0, + "completions/mean_length": 1543.96484375, + "completions/mean_terminated_length": 1456.4990234375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.22754886746406555, + "epoch": 2.264423076923077, + "frac_reward_zero_std": 0.375, + "grad_norm": 23.43297576904297, + "learning_rate": 1e-06, + "loss": 0.028, + "num_tokens": 961827418.0, + "reward": 0.3246934413909912, + "reward_std": 0.04636027663946152, + "rewards/progression_diversity/mean": -0.0023379838094115257, + "rewards/progression_diversity/std": 0.023444252088665962, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.7321288585662842, + "rewards/symbolic_reward_partial_score/std": 0.18431484699249268, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0483700037002563, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 1.1377493143081665, + "step": 1413 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.23179607093334198, + "epoch": 2.266025641025641, + "grad_norm": 0.014759178273379803, + "learning_rate": 1e-06, + "loss": -0.0051, + "step": 1414 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22570616006851196, + "epoch": 2.2676282051282053, + "grad_norm": 0.022691620513796806, + "learning_rate": 1e-06, + "loss": 0.0214, + "step": 1415 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23608656972646713, + "epoch": 2.269230769230769, + "grad_norm": 0.010918180458247662, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4234.0, + "completions/max_terminated_length": 4234.0, + "completions/mean_length": 1428.330078125, + "completions/mean_terminated_length": 1428.330078125, + "completions/min_length": 361.0, + "completions/min_terminated_length": 361.0, + "entropy": 0.2318318784236908, + "epoch": 2.2708333333333335, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.02078605629503727, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 963460099.0, + "reward": 0.3764505386352539, + "reward_std": 0.07807396352291107, + "rewards/progression_diversity/mean": -0.0009436353575438261, + "rewards/progression_diversity/std": 0.011008831672370434, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.7001790404319763, + "rewards/symbolic_reward_partial_score/std": 0.22067667543888092, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0520446300506592, + "sampling/importance_sampling_ratio/min": 0.0008081789710558951, + "sampling/sampling_logp_difference/max": 7.120727062225342, + "sampling/sampling_logp_difference/mean": 0.10327097773551941, + "step": 1417 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23364517837762833, + "epoch": 2.2724358974358974, + "grad_norm": 0.018402917310595512, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 1418 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2324867844581604, + "epoch": 2.2740384615384617, + "grad_norm": 0.015934674069285393, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 1419 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23535658419132233, + "epoch": 2.2756410256410255, + "grad_norm": 0.021948518231511116, + "learning_rate": 1e-06, + "loss": -0.0146, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4152.0, + "completions/mean_length": 1650.921875, + "completions/mean_terminated_length": 1593.1451416015625, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "entropy": 0.23526672273874283, + "epoch": 2.27724358974359, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.023312116041779518, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 965069179.0, + "reward": 0.34999796748161316, + "reward_std": 0.03697862848639488, + "rewards/progression_diversity/mean": -0.0016677755629643798, + "rewards/progression_diversity/std": 0.013735439628362656, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.7461426258087158, + "rewards/symbolic_reward_partial_score/std": 0.19722376763820648, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.04874849319458, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 1.2209948301315308, + "step": 1421 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.23266298323869705, + "epoch": 2.2788461538461537, + "grad_norm": 0.028419379144906998, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1422 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2373318374156952, + "epoch": 2.280448717948718, + "grad_norm": 684.584228515625, + "learning_rate": 1e-06, + "loss": 0.0291, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23147784173488617, + "epoch": 2.282051282051282, + "grad_norm": 0.015300055034458637, + "learning_rate": 1e-06, + "loss": 0.0354, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7387.0, + "completions/mean_length": 1647.84765625, + "completions/mean_terminated_length": 1590.0589599609375, + "completions/min_length": 459.0, + "completions/min_terminated_length": 459.0, + "entropy": 0.22117284685373306, + "epoch": 2.2836538461538463, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.02695753425359726, + "learning_rate": 1e-06, + "loss": 0.0424, + "num_tokens": 966809885.0, + "reward": 0.3317036032676697, + "reward_std": 0.04540080577135086, + "rewards/progression_diversity/mean": -0.0010283860610798001, + "rewards/progression_diversity/std": 0.014310967177152634, + "rewards/symbolic_reward_accuracy/mean": 0.197265625, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.7124837040901184, + "rewards/symbolic_reward_partial_score/std": 0.21933910250663757, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0476967096328735, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 1.099198818206787, + "step": 1425 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2237677425146103, + "epoch": 2.28525641025641, + "grad_norm": 0.013611048460006714, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 1426 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.22483280301094055, + "epoch": 2.2868589743589745, + "grad_norm": 0.02124941535294056, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 1427 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2248641401529312, + "epoch": 2.2884615384615383, + "grad_norm": 0.024975134059786797, + "learning_rate": 1e-06, + "loss": -0.01, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2827.0, + "completions/mean_length": 1540.505859375, + "completions/mean_terminated_length": 1482.296142578125, + "completions/min_length": 386.0, + "completions/min_terminated_length": 386.0, + "entropy": 0.23250573128461838, + "epoch": 2.2900641025641026, + "frac_reward_zero_std": 0.5625, + "grad_norm": 823.3074340820312, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 968396016.0, + "reward": 0.2985820770263672, + "reward_std": 0.015081477351486683, + "rewards/progression_diversity/mean": -0.0006824900628998876, + "rewards/progression_diversity/std": 0.005089160054922104, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.7465982437133789, + "rewards/symbolic_reward_partial_score/std": 0.16918420791625977, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0513556003570557, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.42589616775512695, + "step": 1429 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2318640798330307, + "epoch": 2.2916666666666665, + "grad_norm": 0.01384742185473442, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 1430 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2302088588476181, + "epoch": 2.293269230769231, + "grad_norm": 0.015527794137597084, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1431 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.23823175579309464, + "epoch": 2.2948717948717947, + "grad_norm": 40.3269157409668, + "learning_rate": 1e-06, + "loss": 0.0411, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4556.0, + "completions/mean_length": 1663.361328125, + "completions/mean_terminated_length": 1576.5992431640625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.21347655355930328, + "epoch": 2.296474358974359, + "frac_reward_zero_std": 0.21875, + "grad_norm": 78.3970947265625, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 970189689.0, + "reward": 0.3578779399394989, + "reward_std": 0.055181439965963364, + "rewards/progression_diversity/mean": -0.0007812740514054894, + "rewards/progression_diversity/std": 0.008964339271187782, + "rewards/symbolic_reward_accuracy/mean": 0.25390625, + "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, + "rewards/symbolic_reward_partial_score/mean": 0.6870931386947632, + "rewards/symbolic_reward_partial_score/std": 0.22139233350753784, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0451353788375854, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.4013971090316772, + "step": 1433 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.21957285702228546, + "epoch": 2.298076923076923, + "grad_norm": 0.013484461233019829, + "learning_rate": 1e-06, + "loss": -0.0143, + "step": 1434 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.21703750640153885, + "epoch": 2.2996794871794872, + "grad_norm": 0.030732089653611183, + "learning_rate": 1e-06, + "loss": 0.0609, + "step": 1435 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22398217022418976, + "epoch": 2.301282051282051, + "grad_norm": 0.021893825381994247, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3516.0, + "completions/mean_length": 1687.796875, + "completions/mean_terminated_length": 1601.1788330078125, + "completions/min_length": 342.0, + "completions/min_terminated_length": 342.0, + "entropy": 0.21840395033359528, + "epoch": 2.3028846153846154, + "frac_reward_zero_std": 0.1875, + "grad_norm": 47.56247329711914, + "learning_rate": 1e-06, + "loss": 0.0497, + "num_tokens": 971978193.0, + "reward": 0.3468187153339386, + "reward_std": 0.043784305453300476, + "rewards/progression_diversity/mean": -0.0017242016037926078, + "rewards/progression_diversity/std": 0.017134975641965866, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.7205728888511658, + "rewards/symbolic_reward_partial_score/std": 0.2209366112947464, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0465869903564453, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 1.2852548360824585, + "step": 1437 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22310731559991837, + "epoch": 2.3044871794871793, + "grad_norm": 0.029605181887745857, + "learning_rate": 1e-06, + "loss": -0.0095, + "step": 1438 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.21875733882188797, + "epoch": 2.3060897435897436, + "grad_norm": 0.022749239578843117, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 1439 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.22598165273666382, + "epoch": 2.3076923076923075, + "grad_norm": 0.014465555548667908, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3771.0, + "completions/mean_length": 1645.763671875, + "completions/mean_terminated_length": 1616.9217529296875, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.22972248494625092, + "epoch": 2.309294871794872, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.02680191770195961, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 973626632.0, + "reward": 0.3762263059616089, + "reward_std": 0.05338224023580551, + "rewards/progression_diversity/mean": -0.0018829565960913897, + "rewards/progression_diversity/std": 0.017473464831709862, + "rewards/symbolic_reward_accuracy/mean": 0.25, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.754150390625, + "rewards/symbolic_reward_partial_score/std": 0.19963204860687256, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0482197999954224, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 1.3025373220443726, + "step": 1441 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.22302179783582687, + "epoch": 2.310897435897436, + "grad_norm": 0.016930239275097847, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 1442 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.22931306064128876, + "epoch": 2.3125, + "grad_norm": 0.017596367746591568, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 1443 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2299589440226555, + "epoch": 2.314102564102564, + "grad_norm": 0.01738305389881134, + "learning_rate": 1e-06, + "loss": -0.0058, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5412.0, + "completions/max_terminated_length": 5412.0, + "completions/mean_length": 1501.36328125, + "completions/mean_terminated_length": 1501.36328125, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.23561769723892212, + "epoch": 2.315705128205128, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.024762306362390518, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 975200130.0, + "reward": 0.39472496509552, + "reward_std": 0.027176737785339355, + "rewards/progression_diversity/mean": -0.0006506302743218839, + "rewards/progression_diversity/std": 0.007446780800819397, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.7610839605331421, + "rewards/symbolic_reward_partial_score/std": 0.19756866991519928, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0539369583129883, + "sampling/importance_sampling_ratio/min": 4.62967858494423e-12, + "sampling/sampling_logp_difference/max": 26.098533630371094, + "sampling/sampling_logp_difference/mean": 0.10675007849931717, + "step": 1445 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24369582533836365, + "epoch": 2.3173076923076925, + "grad_norm": 0.02469770982861519, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 1446 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24448219686746597, + "epoch": 2.3189102564102564, + "grad_norm": 0.018661778420209885, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 1447 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24316756427288055, + "epoch": 2.3205128205128207, + "grad_norm": 0.010404370725154877, + "learning_rate": 1e-06, + "loss": 0.0039, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3668.0, + "completions/mean_length": 1508.080078125, + "completions/mean_terminated_length": 1449.7431640625, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.24711936712265015, + "epoch": 2.3221153846153846, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.029730960726737976, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 976758619.0, + "reward": 0.36418893933296204, + "reward_std": 0.02938855066895485, + "rewards/progression_diversity/mean": -0.0005412165191955864, + "rewards/progression_diversity/std": 0.006034459453076124, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.742626965045929, + "rewards/symbolic_reward_partial_score/std": 0.20223668217658997, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0534663200378418, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.6994922161102295, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24198869615793228, + "epoch": 2.323717948717949, + "grad_norm": 0.018425533547997475, + "learning_rate": 1e-06, + "loss": 0.0282, + "step": 1450 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24656683206558228, + "epoch": 2.3253205128205128, + "grad_norm": 0.013853712007403374, + "learning_rate": 1e-06, + "loss": -0.0072, + "step": 1451 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.24908402562141418, + "epoch": 2.326923076923077, + "grad_norm": 0.01092120073735714, + "learning_rate": 1e-06, + "loss": 0.0053, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 1472.052734375, + "completions/mean_terminated_length": 1442.870849609375, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "entropy": 0.2532293498516083, + "epoch": 2.328525641025641, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.021764414384961128, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 978359622.0, + "reward": 0.3215084671974182, + "reward_std": 0.039750613272190094, + "rewards/progression_diversity/mean": -0.0005217455327510834, + "rewards/progression_diversity/std": 0.008216914720833302, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.7129882574081421, + "rewards/symbolic_reward_partial_score/std": 0.21137480437755585, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0538359880447388, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.4409805238246918, + "step": 1453 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.24427498131990433, + "epoch": 2.3301282051282053, + "grad_norm": 0.015310117974877357, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 1454 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2432166486978531, + "epoch": 2.331730769230769, + "grad_norm": 895.1507568359375, + "learning_rate": 1e-06, + "loss": 0.046, + "step": 1455 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.24313075095415115, + "epoch": 2.3333333333333335, + "grad_norm": 0.013614215888082981, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3000.0, + "completions/mean_length": 1463.748046875, + "completions/mean_terminated_length": 1434.5499267578125, + "completions/min_length": 353.0, + "completions/min_terminated_length": 353.0, + "entropy": 0.2332357093691826, + "epoch": 2.3349358974358974, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02649446576833725, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 979968421.0, + "reward": 0.33722105622291565, + "reward_std": 0.0333821102976799, + "rewards/progression_diversity/mean": -0.0005536978715099394, + "rewards/progression_diversity/std": 0.006957771256566048, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.7067708373069763, + "rewards/symbolic_reward_partial_score/std": 0.19518350064754486, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.05262291431427, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.4595038890838623, + "step": 1457 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.240996353328228, + "epoch": 2.3365384615384617, + "grad_norm": 0.00819353200495243, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 1458 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23650028556585312, + "epoch": 2.3381410256410255, + "grad_norm": 0.021920593455433846, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 1459 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2370261400938034, + "epoch": 2.33974358974359, + "grad_norm": 0.011823596432805061, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5619.0, + "completions/mean_length": 1463.85546875, + "completions/mean_terminated_length": 1375.91748046875, + "completions/min_length": 301.0, + "completions/min_terminated_length": 301.0, + "entropy": 0.22917187213897705, + "epoch": 2.3413461538461537, + "frac_reward_zero_std": 0.4375, + "grad_norm": 196.0175018310547, + "learning_rate": 1e-06, + "loss": 0.0506, + "num_tokens": 981567035.0, + "reward": 0.39167657494544983, + "reward_std": 0.050342485308647156, + "rewards/progression_diversity/mean": -0.0008015389903448522, + "rewards/progression_diversity/std": 0.0074392156675457954, + "rewards/symbolic_reward_accuracy/mean": 0.2734375, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.7600423097610474, + "rewards/symbolic_reward_partial_score/std": 0.2186768352985382, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0496225357055664, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 740.0, + "sampling/sampling_logp_difference/mean": 0.7763689756393433, + "step": 1461 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.23547165840864182, + "epoch": 2.342948717948718, + "grad_norm": 0.014117385260760784, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 1462 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2310246080160141, + "epoch": 2.344551282051282, + "grad_norm": 0.038323234766721725, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 1463 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2359386458992958, + "epoch": 2.3461538461538463, + "grad_norm": 0.018555141985416412, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3024.0, + "completions/mean_length": 1395.6953125, + "completions/mean_terminated_length": 1336.917724609375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.23501623421907425, + "epoch": 2.34775641025641, + "frac_reward_zero_std": 0.625, + "grad_norm": 378.42333984375, + "learning_rate": 1e-06, + "loss": 0.0315, + "num_tokens": 983101327.0, + "reward": 0.3867397904396057, + "reward_std": 0.011792978271842003, + "rewards/progression_diversity/mean": -0.00033900359994731843, + "rewards/progression_diversity/std": 0.004276837222278118, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.7943521738052368, + "rewards/symbolic_reward_partial_score/std": 0.1727304607629776, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0502277612686157, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 0.6608302593231201, + "step": 1465 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.22964345663785934, + "epoch": 2.3493589743589745, + "grad_norm": 0.007282482460141182, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1466 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.23587816208600998, + "epoch": 2.3509615384615383, + "grad_norm": 0.02002773992717266, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 1467 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.23493095487356186, + "epoch": 2.3525641025641026, + "grad_norm": 0.007156542036682367, + "learning_rate": 1e-06, + "loss": -0.0054, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2754.0, + "completions/mean_length": 1412.869140625, + "completions/mean_terminated_length": 1354.158935546875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 0.23347257822752, + "epoch": 2.3541666666666665, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.020152533426880836, + "learning_rate": 1e-06, + "loss": 0.0307, + "num_tokens": 984703036.0, + "reward": 0.3319286108016968, + "reward_std": 0.04241711646318436, + "rewards/progression_diversity/mean": -0.0009903897298499942, + "rewards/progression_diversity/std": 0.011630616150796413, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.7282063961029053, + "rewards/symbolic_reward_partial_score/std": 0.18158793449401855, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0504162311553955, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 736.0, + "sampling/sampling_logp_difference/mean": 0.44282588362693787, + "step": 1469 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.23258361965417862, + "epoch": 2.355769230769231, + "grad_norm": 0.016104018315672874, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 1470 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.22412847727537155, + "epoch": 2.3573717948717947, + "grad_norm": 0.009414435364305973, + "learning_rate": 1e-06, + "loss": 0.0275, + "step": 1471 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.2289385423064232, + "epoch": 2.358974358974359, + "grad_norm": 0.014195187948644161, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2557.0, + "completions/max_terminated_length": 2557.0, + "completions/mean_length": 1360.466796875, + "completions/mean_terminated_length": 1360.466796875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.22527696937322617, + "epoch": 2.360576923076923, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.013988327234983444, + "learning_rate": 1e-06, + "loss": -0.0092, + "num_tokens": 986369547.0, + "reward": 0.31377702951431274, + "reward_std": 0.011640192940831184, + "rewards/progression_diversity/mean": -0.00022870188695378602, + "rewards/progression_diversity/std": 0.0025177807547152042, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.6709309816360474, + "rewards/symbolic_reward_partial_score/std": 0.20638255774974823, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.050323247909546, + "sampling/importance_sampling_ratio/min": 0.003235085867345333, + "sampling/sampling_logp_difference/max": 5.733699798583984, + "sampling/sampling_logp_difference/mean": 0.10156988352537155, + "step": 1473 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2217569351196289, + "epoch": 2.3621794871794872, + "grad_norm": 0.02185184508562088, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 1474 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.22397807240486145, + "epoch": 2.363782051282051, + "grad_norm": 0.018456028774380684, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 1475 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.22414569556713104, + "epoch": 2.3653846153846154, + "grad_norm": 0.012643019668757915, + "learning_rate": 1e-06, + "loss": -0.0055, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2527.0, + "completions/mean_length": 1278.501953125, + "completions/mean_terminated_length": 1189.4715576171875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.2367832437157631, + "epoch": 2.3669871794871793, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01879701018333435, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 987864604.0, + "reward": 0.43188178539276123, + "reward_std": 0.03625435382127762, + "rewards/progression_diversity/mean": -0.00029982085106894374, + "rewards/progression_diversity/std": 0.004407213069498539, + "rewards/symbolic_reward_accuracy/mean": 0.330078125, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.7814127206802368, + "rewards/symbolic_reward_partial_score/std": 0.21501034498214722, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0473359823226929, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 744.0, + "sampling/sampling_logp_difference/mean": 2.7259652614593506, + "step": 1477 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2279703989624977, + "epoch": 2.3685897435897436, + "grad_norm": 30.450448989868164, + "learning_rate": 1e-06, + "loss": 0.0318, + "step": 1478 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.23828475177288055, + "epoch": 2.3701923076923075, + "grad_norm": 0.024164684116840363, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 1479 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.23449359089136124, + "epoch": 2.371794871794872, + "grad_norm": 0.01611475460231304, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3828.0, + "completions/mean_length": 1289.505859375, + "completions/mean_terminated_length": 1230.3118896484375, + "completions/min_length": 347.0, + "completions/min_terminated_length": 347.0, + "entropy": 0.23153682053089142, + "epoch": 2.373397435897436, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02150103822350502, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 989388479.0, + "reward": 0.41440147161483765, + "reward_std": 0.038612596690654755, + "rewards/progression_diversity/mean": -0.0002827388816513121, + "rewards/progression_diversity/std": 0.003904320765286684, + "rewards/symbolic_reward_accuracy/mean": 0.306640625, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.7693685293197632, + "rewards/symbolic_reward_partial_score/std": 0.21836112439632416, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0490467548370361, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 748.0, + "sampling/sampling_logp_difference/mean": 1.6480077505111694, + "step": 1481 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2258523479104042, + "epoch": 2.375, + "grad_norm": 0.009424294345080853, + "learning_rate": 1e-06, + "loss": 0.1124, + "step": 1482 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.23436233401298523, + "epoch": 2.376602564102564, + "grad_norm": 0.010978851467370987, + "learning_rate": 1e-06, + "loss": 0.0041, + "step": 1483 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2368711531162262, + "epoch": 2.378205128205128, + "grad_norm": 0.015040039084851742, + "learning_rate": 1e-06, + "loss": -0.0046, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3070.0, + "completions/mean_length": 1375.23046875, + "completions/mean_terminated_length": 1286.7701416015625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.22265422344207764, + "epoch": 2.3798076923076925, + "frac_reward_zero_std": 0.40625, + "grad_norm": 365.71807861328125, + "learning_rate": 1e-06, + "loss": 0.0797, + "num_tokens": 990941957.0, + "reward": 0.2840445041656494, + "reward_std": 0.026247035712003708, + "rewards/progression_diversity/mean": -0.00033585538039915264, + "rewards/progression_diversity/std": 0.0034242472611367702, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.706591784954071, + "rewards/symbolic_reward_partial_score/std": 0.19762486219406128, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0476056337356567, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 748.0, + "sampling/sampling_logp_difference/mean": 2.300495147705078, + "step": 1485 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2373371571302414, + "epoch": 2.3814102564102564, + "grad_norm": 0.01105690747499466, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1486 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23985521495342255, + "epoch": 2.3830128205128207, + "grad_norm": 0.015546144917607307, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 1487 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23665790259838104, + "epoch": 2.3846153846153846, + "grad_norm": 0.01926671527326107, + "learning_rate": 1e-06, + "loss": -0.0068, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2707.0, + "completions/mean_length": 1406.267578125, + "completions/mean_terminated_length": 1288.3326416015625, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.23878996819257736, + "epoch": 2.386217948717949, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.019514024257659912, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 992456558.0, + "reward": 0.2826894521713257, + "reward_std": 0.03426457941532135, + "rewards/progression_diversity/mean": -0.0005875998758710921, + "rewards/progression_diversity/std": 0.00664177630096674, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.7027344107627869, + "rewards/symbolic_reward_partial_score/std": 0.1787167340517044, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0489890575408936, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 2.2175588607788086, + "step": 1489 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24255738407373428, + "epoch": 2.3878205128205128, + "grad_norm": 2492.634033203125, + "learning_rate": 1e-06, + "loss": 0.0505, + "step": 1490 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24588491022586823, + "epoch": 2.389423076923077, + "grad_norm": 0.017873678356409073, + "learning_rate": 1e-06, + "loss": -0.0036, + "step": 1491 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.24287723004817963, + "epoch": 2.391025641025641, + "grad_norm": 0.013864979147911072, + "learning_rate": 1e-06, + "loss": 0.0298, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3078.0, + "completions/mean_length": 1381.619140625, + "completions/mean_terminated_length": 1322.786376953125, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "entropy": 0.23624292761087418, + "epoch": 2.3926282051282053, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.01766468957066536, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 994038987.0, + "reward": 0.3370998203754425, + "reward_std": 0.02162216417491436, + "rewards/progression_diversity/mean": -0.000469559890916571, + "rewards/progression_diversity/std": 0.005131890531629324, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.7734212279319763, + "rewards/symbolic_reward_partial_score/std": 0.15896020829677582, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0494273900985718, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 1.574073076248169, + "step": 1493 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.23203369975090027, + "epoch": 2.394230769230769, + "grad_norm": 0.007126044947654009, + "learning_rate": 1e-06, + "loss": 0.0539, + "step": 1494 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23312392085790634, + "epoch": 2.3958333333333335, + "grad_norm": 0.007795875892043114, + "learning_rate": 1e-06, + "loss": -0.0035, + "step": 1495 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23666821420192719, + "epoch": 2.3974358974358974, + "grad_norm": 0.007526164874434471, + "learning_rate": 1e-06, + "loss": 0.0024, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2845.0, + "completions/mean_length": 1428.548828125, + "completions/mean_terminated_length": 1281.0592041015625, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.23371727764606476, + "epoch": 2.3990384615384617, + "frac_reward_zero_std": 0.59375, + "grad_norm": 837.6632080078125, + "learning_rate": 1e-06, + "loss": 0.0729, + "num_tokens": 995634948.0, + "reward": 0.3479752242565155, + "reward_std": 0.019150175154209137, + "rewards/progression_diversity/mean": -0.00033024564618244767, + "rewards/progression_diversity/std": 0.004327813629060984, + "rewards/symbolic_reward_accuracy/mean": 0.216796875, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.7295898199081421, + "rewards/symbolic_reward_partial_score/std": 0.2091808170080185, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.046621322631836, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 3.1054999828338623, + "step": 1497 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23887716233730316, + "epoch": 2.4006410256410255, + "grad_norm": 7.230476379394531, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 1498 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.24147003889083862, + "epoch": 2.40224358974359, + "grad_norm": 0.013911988586187363, + "learning_rate": 1e-06, + "loss": -0.0055, + "step": 1499 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24635042250156403, + "epoch": 2.4038461538461537, + "grad_norm": 0.013229484669864178, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5214.0, + "completions/mean_length": 1461.9140625, + "completions/mean_terminated_length": 1314.75341796875, + "completions/min_length": 430.0, + "completions/min_terminated_length": 430.0, + "entropy": 0.2452658787369728, + "epoch": 2.405448717948718, + "frac_reward_zero_std": 0.53125, + "grad_norm": 125.90882873535156, + "learning_rate": 1e-06, + "loss": 0.0306, + "num_tokens": 997203848.0, + "reward": 0.2770632207393646, + "reward_std": 0.023135796189308167, + "rewards/progression_diversity/mean": -0.0007097757770679891, + "rewards/progression_diversity/std": 0.01300110761076212, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.6839843988418579, + "rewards/symbolic_reward_partial_score/std": 0.1883450597524643, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0471558570861816, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 2.527804374694824, + "step": 1501 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.23859861493110657, + "epoch": 2.407051282051282, + "grad_norm": 577.0975952148438, + "learning_rate": 1e-06, + "loss": 0.0477, + "step": 1502 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.2435135617852211, + "epoch": 2.4086538461538463, + "grad_norm": 0.014631603844463825, + "learning_rate": 1e-06, + "loss": 0.0252, + "step": 1503 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2454121932387352, + "epoch": 2.41025641025641, + "grad_norm": 0.008022090420126915, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4812.0, + "completions/mean_length": 1448.013671875, + "completions/mean_terminated_length": 1330.407470703125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.24072068184614182, + "epoch": 2.4118589743589745, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.024386148899793625, + "learning_rate": 1e-06, + "loss": -0.0065, + "num_tokens": 998845663.0, + "reward": 0.31266647577285767, + "reward_std": 0.029008327051997185, + "rewards/progression_diversity/mean": -0.00044497710769064724, + "rewards/progression_diversity/std": 0.007153489161282778, + "rewards/symbolic_reward_accuracy/mean": 0.166015625, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.7128092646598816, + "rewards/symbolic_reward_partial_score/std": 0.1801445633172989, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0494557619094849, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 1.1010427474975586, + "step": 1505 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22912582755088806, + "epoch": 2.4134615384615383, + "grad_norm": 0.013357684947550297, + "learning_rate": 1e-06, + "loss": 0.0729, + "step": 1506 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.238428495824337, + "epoch": 2.4150641025641026, + "grad_norm": 0.02128361538052559, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 1507 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.23703129589557648, + "epoch": 2.4166666666666665, + "grad_norm": 0.011308502405881882, + "learning_rate": 1e-06, + "loss": 0.0045, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2632.0, + "completions/mean_length": 1454.595703125, + "completions/mean_terminated_length": 1366.6031494140625, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.23324482142925262, + "epoch": 2.418269230769231, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.020565425977110863, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 1000476448.0, + "reward": 0.35293227434158325, + "reward_std": 0.02206794172525406, + "rewards/progression_diversity/mean": -0.00022909138351678848, + "rewards/progression_diversity/std": 0.0050324564799666405, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.7409017086029053, + "rewards/symbolic_reward_partial_score/std": 0.1797100305557251, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.047702431678772, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 756.0, + "sampling/sampling_logp_difference/mean": 2.1723339557647705, + "step": 1509 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2380816489458084, + "epoch": 2.4198717948717947, + "grad_norm": 0.009358495473861694, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.23932605236768723, + "epoch": 2.421474358974359, + "grad_norm": 0.012663176283240318, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 1511 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.23995714634656906, + "epoch": 2.423076923076923, + "grad_norm": 0.008335144259035587, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2968.0, + "completions/mean_length": 1581.033203125, + "completions/mean_terminated_length": 1375.8436279296875, + "completions/min_length": 429.0, + "completions/min_terminated_length": 429.0, + "entropy": 0.23756451904773712, + "epoch": 2.4246794871794872, + "frac_reward_zero_std": 0.40625, + "grad_norm": 581.8189697265625, + "learning_rate": 1e-06, + "loss": 0.0544, + "num_tokens": 1002237585.0, + "reward": 0.3079501986503601, + "reward_std": 0.04506230354309082, + "rewards/progression_diversity/mean": -0.0013673059875145555, + "rewards/progression_diversity/std": 0.019772524014115334, + "rewards/symbolic_reward_accuracy/mean": 0.166015625, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.6990722417831421, + "rewards/symbolic_reward_partial_score/std": 0.19391867518424988, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0470722913742065, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 2.45670747756958, + "step": 1513 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2470928058028221, + "epoch": 2.426282051282051, + "grad_norm": 0.01648869179189205, + "learning_rate": 1e-06, + "loss": -0.0059, + "step": 1514 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.23297542333602905, + "epoch": 2.4278846153846154, + "grad_norm": 384.6247253417969, + "learning_rate": 1e-06, + "loss": 0.0991, + "step": 1515 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.23933256417512894, + "epoch": 2.4294871794871793, + "grad_norm": 0.019361699000000954, + "learning_rate": 1e-06, + "loss": 0.0483, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8324.0, + "completions/mean_length": 1477.720703125, + "completions/mean_terminated_length": 1360.348388671875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.2378390207886696, + "epoch": 2.4310897435897436, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.024526890367269516, + "learning_rate": 1e-06, + "loss": 0.0322, + "num_tokens": 1003883634.0, + "reward": 0.3165303170681, + "reward_std": 0.02786339819431305, + "rewards/progression_diversity/mean": -0.000289862509816885, + "rewards/progression_diversity/std": 0.004467545077204704, + "rewards/symbolic_reward_accuracy/mean": 0.146484375, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.7647460699081421, + "rewards/symbolic_reward_partial_score/std": 0.18048354983329773, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0483635663986206, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.4491806030273438, + "step": 1517 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2445373684167862, + "epoch": 2.4326923076923075, + "grad_norm": 0.01748676225543022, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 1518 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.23935458809137344, + "epoch": 2.434294871794872, + "grad_norm": 0.04788840189576149, + "learning_rate": 1e-06, + "loss": 0.0312, + "step": 1519 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.24610072374343872, + "epoch": 2.435897435897436, + "grad_norm": 0.014646312221884727, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3350.0, + "completions/mean_length": 1556.814453125, + "completions/mean_terminated_length": 1469.4244384765625, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "entropy": 0.24429644644260406, + "epoch": 2.4375, + "frac_reward_zero_std": 0.34375, + "grad_norm": 461.6620788574219, + "learning_rate": 1e-06, + "loss": 0.0507, + "num_tokens": 1005568947.0, + "reward": 0.3079223334789276, + "reward_std": 0.038352809846401215, + "rewards/progression_diversity/mean": -0.00024961589951999485, + "rewards/progression_diversity/std": 0.004486089572310448, + "rewards/symbolic_reward_accuracy/mean": 0.14453125, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.739306628704071, + "rewards/symbolic_reward_partial_score/std": 0.18258456885814667, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0502259731292725, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.174125909805298, + "step": 1521 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.24371539801359177, + "epoch": 2.439102564102564, + "grad_norm": 0.015633653849363327, + "learning_rate": 1e-06, + "loss": 0.0287, + "step": 1522 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2456585019826889, + "epoch": 2.440705128205128, + "grad_norm": 0.021190311759710312, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 1523 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.24361753463745117, + "epoch": 2.4423076923076925, + "grad_norm": 0.011759229004383087, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5319.0, + "completions/mean_length": 1642.1484375, + "completions/mean_terminated_length": 1467.3438720703125, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.24563409388065338, + "epoch": 2.4439102564102564, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1274.161376953125, + "learning_rate": 1e-06, + "loss": 0.0363, + "num_tokens": 1007251439.0, + "reward": 0.3221524655818939, + "reward_std": 0.05205688625574112, + "rewards/progression_diversity/mean": -0.0015531220706179738, + "rewards/progression_diversity/std": 0.016780469566583633, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.6982421875, + "rewards/symbolic_reward_partial_score/std": 0.20122045278549194, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0466976165771484, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.579723834991455, + "step": 1525 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.23505118489265442, + "epoch": 2.4455128205128207, + "grad_norm": 5386.1142578125, + "learning_rate": 1e-06, + "loss": 0.7638, + "step": 1526 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23239125311374664, + "epoch": 2.4471153846153846, + "grad_norm": 0.012965874746441841, + "learning_rate": 1e-06, + "loss": 0.0457, + "step": 1527 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.23721230030059814, + "epoch": 2.448717948717949, + "grad_norm": 0.01935112476348877, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5069.0, + "completions/mean_length": 1434.068359375, + "completions/mean_terminated_length": 1345.954833984375, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "entropy": 0.2473963499069214, + "epoch": 2.4503205128205128, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.016699276864528656, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 1008849506.0, + "reward": 0.3746347427368164, + "reward_std": 0.0481521412730217, + "rewards/progression_diversity/mean": -0.0008823598036542535, + "rewards/progression_diversity/std": 0.01371445506811142, + "rewards/symbolic_reward_accuracy/mean": 0.25390625, + "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, + "rewards/symbolic_reward_partial_score/mean": 0.7429524660110474, + "rewards/symbolic_reward_partial_score/std": 0.21133190393447876, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0495123863220215, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 2.0027153491973877, + "step": 1529 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.24604248255491257, + "epoch": 2.451923076923077, + "grad_norm": 0.012123506516218185, + "learning_rate": 1e-06, + "loss": 0.013, + "step": 1530 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.24536657333374023, + "epoch": 2.453525641025641, + "grad_norm": 0.010596076026558876, + "learning_rate": 1e-06, + "loss": -0.007, + "step": 1531 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.23595212399959564, + "epoch": 2.4551282051282053, + "grad_norm": 0.009966249577701092, + "learning_rate": 1e-06, + "loss": 0.1189, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5517.0, + "completions/mean_length": 1541.44921875, + "completions/mean_terminated_length": 1424.5787353515625, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.23892727494239807, + "epoch": 2.456730769230769, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.019874002784490585, + "learning_rate": 1e-06, + "loss": -0.0075, + "num_tokens": 1010481960.0, + "reward": 0.4098053574562073, + "reward_std": 0.033480141311883926, + "rewards/progression_diversity/mean": -0.001889547100290656, + "rewards/progression_diversity/std": 0.03145535662770271, + "rewards/symbolic_reward_accuracy/mean": 0.30859375, + "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, + "rewards/symbolic_reward_partial_score/mean": 0.7514973878860474, + "rewards/symbolic_reward_partial_score/std": 0.20948295295238495, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0455158948898315, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.3153014183044434, + "step": 1533 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23280858993530273, + "epoch": 2.4583333333333335, + "grad_norm": 0.01161316316574812, + "learning_rate": 1e-06, + "loss": 0.0201, + "step": 1534 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2302737832069397, + "epoch": 2.4599358974358974, + "grad_norm": 0.030310511589050293, + "learning_rate": 1e-06, + "loss": 0.0335, + "step": 1535 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.23068059235811234, + "epoch": 2.4615384615384617, + "grad_norm": 0.010563918389379978, + "learning_rate": 1e-06, + "loss": 0.054, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4376.0, + "completions/mean_length": 1490.501953125, + "completions/mean_terminated_length": 1373.2303466796875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.2301376685500145, + "epoch": 2.4631410256410255, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.016749106347560883, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 1012094073.0, + "reward": 0.36312228441238403, + "reward_std": 0.039926785975694656, + "rewards/progression_diversity/mean": -0.0017360053025186062, + "rewards/progression_diversity/std": 0.01971437782049179, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.7404134273529053, + "rewards/symbolic_reward_partial_score/std": 0.20340432226657867, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045566439628601, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 2.109564781188965, + "step": 1537 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22669820487499237, + "epoch": 2.46474358974359, + "grad_norm": 0.013202717527747154, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 1538 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.23074190318584442, + "epoch": 2.4663461538461537, + "grad_norm": 0.028765065595507622, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 1539 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.224589504301548, + "epoch": 2.467948717948718, + "grad_norm": 0.021383749321103096, + "learning_rate": 1e-06, + "loss": 0.0617, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4875.0, + "completions/mean_length": 1534.798828125, + "completions/mean_terminated_length": 1388.3570556640625, + "completions/min_length": 402.0, + "completions/min_terminated_length": 402.0, + "entropy": 0.2195545881986618, + "epoch": 2.469551282051282, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.039864543825387955, + "learning_rate": 1e-06, + "loss": 0.0975, + "num_tokens": 1013848578.0, + "reward": 0.2403162568807602, + "reward_std": 0.0261523500084877, + "rewards/progression_diversity/mean": -0.0010903782676905394, + "rewards/progression_diversity/std": 0.020245160907506943, + "rewards/symbolic_reward_accuracy/mean": 0.060546875, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.6832519769668579, + "rewards/symbolic_reward_partial_score/std": 0.18514306843280792, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0433101654052734, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.134794235229492, + "step": 1541 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.22596587240695953, + "epoch": 2.4711538461538463, + "grad_norm": 0.01767021417617798, + "learning_rate": 1e-06, + "loss": -0.0098, + "step": 1542 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.22497133910655975, + "epoch": 2.47275641025641, + "grad_norm": 0.013781007379293442, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1543 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2270878478884697, + "epoch": 2.4743589743589745, + "grad_norm": 0.011908365413546562, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3915.0, + "completions/mean_length": 1276.716796875, + "completions/mean_terminated_length": 1247.152587890625, + "completions/min_length": 390.0, + "completions/min_terminated_length": 390.0, + "entropy": 0.23582519590854645, + "epoch": 2.4759615384615383, + "frac_reward_zero_std": 0.65625, + "grad_norm": 581.7129516601562, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 1015333441.0, + "reward": 0.40914207696914673, + "reward_std": 0.014557499438524246, + "rewards/progression_diversity/mean": -0.0003453929675742984, + "rewards/progression_diversity/std": 0.00676377210766077, + "rewards/symbolic_reward_accuracy/mean": 0.310546875, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.7433756589889526, + "rewards/symbolic_reward_partial_score/std": 0.229752317070961, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0511703491210938, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 0.7563881278038025, + "step": 1545 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.23144873976707458, + "epoch": 2.4775641025641026, + "grad_norm": 0.010454997420310974, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 1546 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2326817438006401, + "epoch": 2.4791666666666665, + "grad_norm": 0.01226350199431181, + "learning_rate": 1e-06, + "loss": -0.0076, + "step": 1547 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.23457026481628418, + "epoch": 2.480769230769231, + "grad_norm": 0.009266077540814877, + "learning_rate": 1e-06, + "loss": -0.0032, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4929.0, + "completions/mean_length": 1437.140625, + "completions/mean_terminated_length": 1289.7357177734375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.21618512272834778, + "epoch": 2.4823717948717947, + "frac_reward_zero_std": 0.34375, + "grad_norm": 437.86102294921875, + "learning_rate": 1e-06, + "loss": 0.0556, + "num_tokens": 1016930265.0, + "reward": 0.3245599865913391, + "reward_std": 0.04626595973968506, + "rewards/progression_diversity/mean": -0.0015235436148941517, + "rewards/progression_diversity/std": 0.01856166310608387, + "rewards/symbolic_reward_accuracy/mean": 0.193359375, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.6984537839889526, + "rewards/symbolic_reward_partial_score/std": 0.2078658640384674, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.044947862625122, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 756.0, + "sampling/sampling_logp_difference/mean": 2.158604145050049, + "step": 1549 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.23416189849376678, + "epoch": 2.483974358974359, + "grad_norm": 0.011794805526733398, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 1550 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2325107902288437, + "epoch": 2.485576923076923, + "grad_norm": 0.018296558409929276, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 1551 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.21691740304231644, + "epoch": 2.4871794871794872, + "grad_norm": 0.03286243975162506, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4565.0, + "completions/mean_length": 1326.94921875, + "completions/mean_terminated_length": 1238.204345703125, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "entropy": 0.21416771411895752, + "epoch": 2.488782051282051, + "frac_reward_zero_std": 0.34375, + "grad_norm": 242.9971160888672, + "learning_rate": 1e-06, + "loss": 0.0253, + "num_tokens": 1018587343.0, + "reward": 0.24987833201885223, + "reward_std": 0.025197582319378853, + "rewards/progression_diversity/mean": -0.0004485528916120529, + "rewards/progression_diversity/std": 0.0056631616316735744, + "rewards/symbolic_reward_accuracy/mean": 0.08984375, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.6552083492279053, + "rewards/symbolic_reward_partial_score/std": 0.1901572048664093, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0427167415618896, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.669922351837158, + "step": 1553 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.21156911551952362, + "epoch": 2.4903846153846154, + "grad_norm": 0.014419353567063808, + "learning_rate": 1e-06, + "loss": 0.0138, + "step": 1554 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.21717411279678345, + "epoch": 2.4919871794871793, + "grad_norm": 0.011920792050659657, + "learning_rate": 1e-06, + "loss": 0.0193, + "step": 1555 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.21699684858322144, + "epoch": 2.4935897435897436, + "grad_norm": 0.011259951628744602, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3890.0, + "completions/mean_length": 1333.09765625, + "completions/mean_terminated_length": 1244.3890380859375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.2051466777920723, + "epoch": 2.4951923076923075, + "frac_reward_zero_std": 0.53125, + "grad_norm": 179.58285522460938, + "learning_rate": 1e-06, + "loss": 0.0567, + "num_tokens": 1020162753.0, + "reward": 0.4125392436981201, + "reward_std": 0.02173442952334881, + "rewards/progression_diversity/mean": -0.0004710496577899903, + "rewards/progression_diversity/std": 0.004797840025275946, + "rewards/symbolic_reward_accuracy/mean": 0.3046875, + "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, + "rewards/symbolic_reward_partial_score/mean": 0.7670735716819763, + "rewards/symbolic_reward_partial_score/std": 0.21079780161380768, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0423927307128906, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 1.7513947486877441, + "step": 1557 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.21501071751117706, + "epoch": 2.496794871794872, + "grad_norm": 0.00948250014334917, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 1558 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2102562114596367, + "epoch": 2.498397435897436, + "grad_norm": 0.007870234549045563, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 1559 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.21096129715442657, + "epoch": 2.5, + "grad_norm": 0.007708584889769554, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4414.0, + "completions/mean_length": 1309.21484375, + "completions/mean_terminated_length": 1220.365478515625, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.20147471129894257, + "epoch": 2.501602564102564, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.02653343416750431, + "learning_rate": 1e-06, + "loss": 0.0258, + "num_tokens": 1021758815.0, + "reward": 0.30301350355148315, + "reward_std": 0.051949888467788696, + "rewards/progression_diversity/mean": -0.003340594470500946, + "rewards/progression_diversity/std": 0.030672553926706314, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.7074218988418579, + "rewards/symbolic_reward_partial_score/std": 0.1824207305908203, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0406644344329834, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.7471396923065186, + "step": 1561 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.20691817998886108, + "epoch": 2.503205128205128, + "grad_norm": 0.018390124663710594, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1562 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.20330524444580078, + "epoch": 2.5048076923076925, + "grad_norm": 0.01390912476927042, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.20695596933364868, + "epoch": 2.5064102564102564, + "grad_norm": 0.01517625991255045, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4008.0, + "completions/mean_length": 1362.990234375, + "completions/mean_terminated_length": 1214.85400390625, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "entropy": 0.19462278485298157, + "epoch": 2.5080128205128203, + "frac_reward_zero_std": 0.4375, + "grad_norm": 506.2625732421875, + "learning_rate": 1e-06, + "loss": 0.0486, + "num_tokens": 1023359562.0, + "reward": 0.34047943353652954, + "reward_std": 0.035891205072402954, + "rewards/progression_diversity/mean": -0.0008854115731082857, + "rewards/progression_diversity/std": 0.008040647022426128, + "rewards/symbolic_reward_accuracy/mean": 0.205078125, + "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, + "rewards/symbolic_reward_partial_score/mean": 0.7280598878860474, + "rewards/symbolic_reward_partial_score/std": 0.1988910287618637, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.037065029144287, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.887390375137329, + "step": 1565 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.20349303632974625, + "epoch": 2.5096153846153846, + "grad_norm": 0.02085845172405243, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 1566 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.1964193433523178, + "epoch": 2.511217948717949, + "grad_norm": 0.010187533684074879, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 1567 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.20827952772378922, + "epoch": 2.5128205128205128, + "grad_norm": 0.00918852724134922, + "learning_rate": 1e-06, + "loss": 0.0049, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4599.0, + "completions/mean_length": 1323.048828125, + "completions/mean_terminated_length": 1234.281005859375, + "completions/min_length": 421.0, + "completions/min_terminated_length": 421.0, + "entropy": 0.21191302686929703, + "epoch": 2.5144230769230766, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.020111212506890297, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 1024883955.0, + "reward": 0.3286048173904419, + "reward_std": 0.0436086431145668, + "rewards/progression_diversity/mean": -0.0013349888613447547, + "rewards/progression_diversity/std": 0.01073912438005209, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.7184407711029053, + "rewards/symbolic_reward_partial_score/std": 0.20771624147891998, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0429773330688477, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 756.0, + "sampling/sampling_logp_difference/mean": 1.301384687423706, + "step": 1569 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2052868902683258, + "epoch": 2.516025641025641, + "grad_norm": 0.018205396831035614, + "learning_rate": 1e-06, + "loss": 0.0922, + "step": 1570 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.20686771720647812, + "epoch": 2.5176282051282053, + "grad_norm": 0.010167265310883522, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 1571 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.20857567340135574, + "epoch": 2.519230769230769, + "grad_norm": 0.012847342528402805, + "learning_rate": 1e-06, + "loss": 0.026, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2660.0, + "completions/mean_length": 1316.029296875, + "completions/mean_terminated_length": 1197.3839111328125, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.2158561423420906, + "epoch": 2.5208333333333335, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.013560623861849308, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 1026358946.0, + "reward": 0.29157090187072754, + "reward_std": 0.04181563854217529, + "rewards/progression_diversity/mean": -0.0011149711208418012, + "rewards/progression_diversity/std": 0.010894270613789558, + "rewards/symbolic_reward_accuracy/mean": 0.130859375, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.7115234136581421, + "rewards/symbolic_reward_partial_score/std": 0.1730296015739441, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0387065410614014, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.532683849334717, + "step": 1573 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.20614393055438995, + "epoch": 2.5224358974358974, + "grad_norm": 0.023240137845277786, + "learning_rate": 1e-06, + "loss": 0.0632, + "step": 1574 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.21372488886117935, + "epoch": 2.5240384615384617, + "grad_norm": 0.016561856493353844, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 1575 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.209242083132267, + "epoch": 2.5256410256410255, + "grad_norm": 0.011606593616306782, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4388.0, + "completions/mean_length": 1337.361328125, + "completions/mean_terminated_length": 1218.8839111328125, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.21050991117954254, + "epoch": 2.52724358974359, + "frac_reward_zero_std": 0.34375, + "grad_norm": 682.1195068359375, + "learning_rate": 1e-06, + "loss": 0.0348, + "num_tokens": 1027800187.0, + "reward": 0.3529677093029022, + "reward_std": 0.03494004160165787, + "rewards/progression_diversity/mean": -0.002058391459286213, + "rewards/progression_diversity/std": 0.028598375618457794, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.7410807609558105, + "rewards/symbolic_reward_partial_score/std": 0.19470229744911194, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0408451557159424, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.35636568069458, + "step": 1577 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.20368197560310364, + "epoch": 2.5288461538461537, + "grad_norm": 0.014921136200428009, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 1578 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21754014492034912, + "epoch": 2.530448717948718, + "grad_norm": 0.018416838720440865, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 1579 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.21085964888334274, + "epoch": 2.532051282051282, + "grad_norm": 0.022706199437379837, + "learning_rate": 1e-06, + "loss": 0.0528, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4464.0, + "completions/mean_length": 1427.904296875, + "completions/mean_terminated_length": 1250.559326171875, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "entropy": 0.21092313528060913, + "epoch": 2.5336538461538463, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.020028864964842796, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 1029409994.0, + "reward": 0.32646113634109497, + "reward_std": 0.04266618192195892, + "rewards/progression_diversity/mean": -0.0013511213473975658, + "rewards/progression_diversity/std": 0.012499148957431316, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.7243163585662842, + "rewards/symbolic_reward_partial_score/std": 0.20394515991210938, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0392813682556152, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.5135602951049805, + "step": 1581 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.20285531133413315, + "epoch": 2.53525641025641, + "grad_norm": 0.013795009814202785, + "learning_rate": 1e-06, + "loss": 0.035, + "step": 1582 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2033623531460762, + "epoch": 2.5368589743589745, + "grad_norm": 0.010789959691464901, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 1583 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2029578685760498, + "epoch": 2.5384615384615383, + "grad_norm": 0.009514546021819115, + "learning_rate": 1e-06, + "loss": 0.0284, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3833.0, + "completions/mean_length": 1337.1015625, + "completions/mean_terminated_length": 1218.6220703125, + "completions/min_length": 364.0, + "completions/min_terminated_length": 364.0, + "entropy": 0.19549165666103363, + "epoch": 2.5400641025641026, + "frac_reward_zero_std": 0.3125, + "grad_norm": 663.7005004882812, + "learning_rate": 1e-06, + "loss": 0.0671, + "num_tokens": 1031023614.0, + "reward": 0.3425098657608032, + "reward_std": 0.050741590559482574, + "rewards/progression_diversity/mean": -0.0014545343583449721, + "rewards/progression_diversity/std": 0.01686347834765911, + "rewards/symbolic_reward_accuracy/mean": 0.220703125, + "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, + "rewards/symbolic_reward_partial_score/mean": 0.7029459476470947, + "rewards/symbolic_reward_partial_score/std": 0.21629534661769867, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0390223264694214, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.6212563514709473, + "step": 1585 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2048923447728157, + "epoch": 2.5416666666666665, + "grad_norm": 0.015606733970344067, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 1586 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.19919227808713913, + "epoch": 2.543269230769231, + "grad_norm": 0.02132081612944603, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 1587 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2042047083377838, + "epoch": 2.5448717948717947, + "grad_norm": 0.01672261208295822, + "learning_rate": 1e-06, + "loss": -0.0112, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3849.0, + "completions/mean_length": 1405.666015625, + "completions/mean_terminated_length": 1287.726318359375, + "completions/min_length": 417.0, + "completions/min_terminated_length": 417.0, + "entropy": 0.20447014272212982, + "epoch": 2.546474358974359, + "frac_reward_zero_std": 0.40625, + "grad_norm": 262.0090637207031, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 1032674947.0, + "reward": 0.3156275749206543, + "reward_std": 0.04013008996844292, + "rewards/progression_diversity/mean": -0.0007197089726105332, + "rewards/progression_diversity/std": 0.008052974939346313, + "rewards/symbolic_reward_accuracy/mean": 0.181640625, + "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, + "rewards/symbolic_reward_partial_score/mean": 0.6914387941360474, + "rewards/symbolic_reward_partial_score/std": 0.2133348137140274, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0389673709869385, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.4872024059295654, + "step": 1589 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.1969490796327591, + "epoch": 2.5480769230769234, + "grad_norm": 0.020587697625160217, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 1590 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2012443020939827, + "epoch": 2.5496794871794872, + "grad_norm": 0.012877855449914932, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 1591 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2060464546084404, + "epoch": 2.551282051282051, + "grad_norm": 0.017594829201698303, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4988.0, + "completions/mean_length": 1498.439453125, + "completions/mean_terminated_length": 1351.6390380859375, + "completions/min_length": 495.0, + "completions/min_terminated_length": 495.0, + "entropy": 0.20640329271554947, + "epoch": 2.5528846153846154, + "frac_reward_zero_std": 0.28125, + "grad_norm": 406.1407470703125, + "learning_rate": 1e-06, + "loss": 0.0347, + "num_tokens": 1034189668.0, + "reward": 0.30137068033218384, + "reward_std": 0.058662112802267075, + "rewards/progression_diversity/mean": -0.00013947187107987702, + "rewards/progression_diversity/std": 0.0018558743176981807, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.6946777105331421, + "rewards/symbolic_reward_partial_score/std": 0.20915855467319489, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0420677661895752, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 756.0, + "sampling/sampling_logp_difference/mean": 1.9947669506072998, + "step": 1593 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.21118755638599396, + "epoch": 2.5544871794871797, + "grad_norm": 1052.406494140625, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1594 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2124936804175377, + "epoch": 2.5560897435897436, + "grad_norm": 0.026562146842479706, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 1595 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.21463356912136078, + "epoch": 2.5576923076923075, + "grad_norm": 0.01585652120411396, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4207.0, + "completions/mean_length": 1389.62890625, + "completions/mean_terminated_length": 1330.8275146484375, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.2117195725440979, + "epoch": 2.559294871794872, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02877124771475792, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 1035825670.0, + "reward": 0.3299679160118103, + "reward_std": 0.03433844819664955, + "rewards/progression_diversity/mean": -0.00028184783877804875, + "rewards/progression_diversity/std": 0.003830707399174571, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.7262043952941895, + "rewards/symbolic_reward_partial_score/std": 0.18292047083377838, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0448970794677734, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 0.6958411931991577, + "step": 1597 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.20743755996227264, + "epoch": 2.560897435897436, + "grad_norm": 0.028087658807635307, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 1598 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.20653068274259567, + "epoch": 2.5625, + "grad_norm": 0.014355894178152084, + "learning_rate": 1e-06, + "loss": 0.0279, + "step": 1599 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2106596827507019, + "epoch": 2.564102564102564, + "grad_norm": 0.01578899845480919, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3138.0, + "completions/mean_length": 1489.35546875, + "completions/mean_terminated_length": 1282.8951416015625, + "completions/min_length": 453.0, + "completions/min_terminated_length": 453.0, + "entropy": 0.21462351083755493, + "epoch": 2.565705128205128, + "frac_reward_zero_std": 0.40625, + "grad_norm": 820.033935546875, + "learning_rate": 1e-06, + "loss": 0.0379, + "num_tokens": 1037399484.0, + "reward": 0.3129954934120178, + "reward_std": 0.025571607053279877, + "rewards/progression_diversity/mean": -0.0007452977006323636, + "rewards/progression_diversity/std": 0.01137583889067173, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.7347493171691895, + "rewards/symbolic_reward_partial_score/std": 0.177503302693367, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0383716821670532, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.7996788024902344, + "step": 1601 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21920231729745865, + "epoch": 2.5673076923076925, + "grad_norm": 0.01972138322889805, + "learning_rate": 1e-06, + "loss": 0.036, + "step": 1602 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21449632197618484, + "epoch": 2.5689102564102564, + "grad_norm": 0.015361227095127106, + "learning_rate": 1e-06, + "loss": 0.0557, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.21294260025024414, + "epoch": 2.5705128205128203, + "grad_norm": 0.009509088471531868, + "learning_rate": 1e-06, + "loss": 0.0432, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5541.0, + "completions/mean_length": 1318.08984375, + "completions/mean_terminated_length": 1199.4605712890625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.21971651911735535, + "epoch": 2.5721153846153846, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.024938074871897697, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 1039039034.0, + "reward": 0.29758530855178833, + "reward_std": 0.030321484431624413, + "rewards/progression_diversity/mean": -0.0017264732159674168, + "rewards/progression_diversity/std": 0.027802200987935066, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.6899251341819763, + "rewards/symbolic_reward_partial_score/std": 0.21471048891544342, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0427124500274658, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 752.0, + "sampling/sampling_logp_difference/mean": 3.036716938018799, + "step": 1605 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2219068706035614, + "epoch": 2.573717948717949, + "grad_norm": 0.012791654095053673, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 1606 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.20912151038646698, + "epoch": 2.5753205128205128, + "grad_norm": 0.008532814681529999, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 1607 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.22284353524446487, + "epoch": 2.5769230769230766, + "grad_norm": 0.019355937838554382, + "learning_rate": 1e-06, + "loss": 0.0272, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4659.0, + "completions/mean_length": 1479.640625, + "completions/mean_terminated_length": 1302.9091796875, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.22081206738948822, + "epoch": 2.578525641025641, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.018469780683517456, + "learning_rate": 1e-06, + "loss": 0.0248, + "num_tokens": 1040598594.0, + "reward": 0.41613101959228516, + "reward_std": 0.07369263470172882, + "rewards/progression_diversity/mean": -0.0035969598684459925, + "rewards/progression_diversity/std": 0.033173561096191406, + "rewards/symbolic_reward_accuracy/mean": 0.306640625, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.7778483033180237, + "rewards/symbolic_reward_partial_score/std": 0.20071126520633698, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0395357608795166, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 756.0, + "sampling/sampling_logp_difference/mean": 4.858417510986328, + "step": 1609 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2339448481798172, + "epoch": 2.5801282051282053, + "grad_norm": 0.021145859733223915, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1610 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.22270727157592773, + "epoch": 2.581730769230769, + "grad_norm": 0.014868995174765587, + "learning_rate": 1e-06, + "loss": 0.0422, + "step": 1611 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.21710411459207535, + "epoch": 2.5833333333333335, + "grad_norm": 1147.5833740234375, + "learning_rate": 1e-06, + "loss": 0.2004, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5854.0, + "completions/mean_length": 1488.01953125, + "completions/mean_terminated_length": 1311.387451171875, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.2146845906972885, + "epoch": 2.5849358974358974, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.021604053676128387, + "learning_rate": 1e-06, + "loss": 0.0206, + "num_tokens": 1042269212.0, + "reward": 0.3228908181190491, + "reward_std": 0.06467078626155853, + "rewards/progression_diversity/mean": -0.0019355263793841004, + "rewards/progression_diversity/std": 0.025548765435814857, + "rewards/symbolic_reward_accuracy/mean": 0.16015625, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.7599608898162842, + "rewards/symbolic_reward_partial_score/std": 0.1819555014371872, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0413014888763428, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.9018664360046387, + "step": 1613 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.20722515881061554, + "epoch": 2.5865384615384617, + "grad_norm": 440.9079895019531, + "learning_rate": 1e-06, + "loss": 0.1145, + "step": 1614 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.20744751393795013, + "epoch": 2.5881410256410255, + "grad_norm": 2084.2294921875, + "learning_rate": 1e-06, + "loss": 0.2864, + "step": 1615 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.22113841027021408, + "epoch": 2.58974358974359, + "grad_norm": 0.01803417317569256, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3889.0, + "completions/mean_length": 1386.021484375, + "completions/mean_terminated_length": 1297.624755859375, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "entropy": 0.21602119505405426, + "epoch": 2.5913461538461537, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0642000138759613, + "learning_rate": 1e-06, + "loss": 0.0316, + "num_tokens": 1043874039.0, + "reward": 0.2649605870246887, + "reward_std": 0.02608659118413925, + "rewards/progression_diversity/mean": -0.0019893196877092123, + "rewards/progression_diversity/std": 0.021042335778474808, + "rewards/symbolic_reward_accuracy/mean": 0.095703125, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.693164050579071, + "rewards/symbolic_reward_partial_score/std": 0.17582449316978455, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0435926914215088, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.047320604324341, + "step": 1617 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2237110733985901, + "epoch": 2.592948717948718, + "grad_norm": 0.022002186626195908, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 1618 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.21986420452594757, + "epoch": 2.594551282051282, + "grad_norm": 0.01582951843738556, + "learning_rate": 1e-06, + "loss": 0.0343, + "step": 1619 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.22045395523309708, + "epoch": 2.5961538461538463, + "grad_norm": 0.021329455077648163, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2783.0, + "completions/mean_length": 1341.115234375, + "completions/mean_terminated_length": 1311.6771240234375, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.22125251591205597, + "epoch": 2.59775641025641, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.018027139827609062, + "learning_rate": 1e-06, + "loss": 0.0207, + "num_tokens": 1045449474.0, + "reward": 0.36266857385635376, + "reward_std": 0.047155484557151794, + "rewards/progression_diversity/mean": -0.0007219260442070663, + "rewards/progression_diversity/std": 0.00793998222798109, + "rewards/symbolic_reward_accuracy/mean": 0.25390625, + "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, + "rewards/symbolic_reward_partial_score/mean": 0.7017577886581421, + "rewards/symbolic_reward_partial_score/std": 0.2232026308774948, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0492632389068604, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 0.9296596050262451, + "step": 1621 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.22497374564409256, + "epoch": 2.5993589743589745, + "grad_norm": 0.01935565285384655, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 1622 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22052931785583496, + "epoch": 2.6009615384615383, + "grad_norm": 0.007344152312725782, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 1623 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.21955804526805878, + "epoch": 2.6025641025641026, + "grad_norm": 0.013848908245563507, + "learning_rate": 1e-06, + "loss": 0.0017, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3666.0, + "completions/mean_length": 1220.310546875, + "completions/mean_terminated_length": 1190.635986328125, + "completions/min_length": 414.0, + "completions/min_terminated_length": 414.0, + "entropy": 0.23253807425498962, + "epoch": 2.6041666666666665, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.01905795931816101, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 1046961025.0, + "reward": 0.31768307089805603, + "reward_std": 0.016084838658571243, + "rewards/progression_diversity/mean": -0.0002500782429706305, + "rewards/progression_diversity/std": 0.003307226812466979, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.6885091066360474, + "rewards/symbolic_reward_partial_score/std": 0.20202499628067017, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0517390966415405, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 756.0, + "sampling/sampling_logp_difference/mean": 0.775344729423523, + "step": 1625 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2306702509522438, + "epoch": 2.605769230769231, + "grad_norm": 0.009856280870735645, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 1626 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22651749849319458, + "epoch": 2.6073717948717947, + "grad_norm": 0.03335234895348549, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 1627 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23627790808677673, + "epoch": 2.608974358974359, + "grad_norm": 0.01697065308690071, + "learning_rate": 1e-06, + "loss": -0.0023, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10288.0, + "completions/mean_length": 1333.587890625, + "completions/mean_terminated_length": 1274.5667724609375, + "completions/min_length": 498.0, + "completions/min_terminated_length": 498.0, + "entropy": 0.2252894639968872, + "epoch": 2.6105769230769234, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.10012649744749069, + "learning_rate": 1e-06, + "loss": 0.0249, + "num_tokens": 1048435198.0, + "reward": 0.3364197015762329, + "reward_std": 0.03154347091913223, + "rewards/progression_diversity/mean": -0.0025622413959354162, + "rewards/progression_diversity/std": 0.030175259336829185, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.7035156488418579, + "rewards/symbolic_reward_partial_score/std": 0.19566431641578674, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0484051704406738, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.8174188137054443, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.23386212438344955, + "epoch": 2.6121794871794872, + "grad_norm": 0.015600372105836868, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 1630 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.23984410613775253, + "epoch": 2.613782051282051, + "grad_norm": 0.0189208984375, + "learning_rate": 1e-06, + "loss": -0.0074, + "step": 1631 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.22813158482313156, + "epoch": 2.6153846153846154, + "grad_norm": 0.01574343629181385, + "learning_rate": 1e-06, + "loss": 0.0725, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3291.0, + "completions/max_terminated_length": 3291.0, + "completions/mean_length": 1187.220703125, + "completions/mean_terminated_length": 1187.220703125, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.23443499207496643, + "epoch": 2.6169871794871797, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.015873905271291733, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 1049949615.0, + "reward": 0.33012884855270386, + "reward_std": 0.014471746981143951, + "rewards/progression_diversity/mean": -0.0003017832641489804, + "rewards/progression_diversity/std": 0.0035524494014680386, + "rewards/symbolic_reward_accuracy/mean": 0.181640625, + "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, + "rewards/symbolic_reward_partial_score/mean": 0.7371581792831421, + "rewards/symbolic_reward_partial_score/std": 0.1770060658454895, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0539133548736572, + "sampling/importance_sampling_ratio/min": 0.0032137008383870125, + "sampling/sampling_logp_difference/max": 5.740332126617432, + "sampling/sampling_logp_difference/mean": 0.1061110571026802, + "step": 1633 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2335355207324028, + "epoch": 2.6185897435897436, + "grad_norm": 0.008864963427186012, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.23033030331134796, + "epoch": 2.6201923076923075, + "grad_norm": 0.015018898993730545, + "learning_rate": 1e-06, + "loss": -0.0089, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.23148179799318314, + "epoch": 2.621794871794872, + "grad_norm": 0.008959776721894741, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2425.0, + "completions/mean_length": 1215.390625, + "completions/mean_terminated_length": 1155.906005859375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.2370157241821289, + "epoch": 2.623397435897436, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01420600526034832, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 1051439463.0, + "reward": 0.3070603907108307, + "reward_std": 0.03020370379090309, + "rewards/progression_diversity/mean": -0.0019726285245269537, + "rewards/progression_diversity/std": 0.030601099133491516, + "rewards/symbolic_reward_accuracy/mean": 0.16796875, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.68896484375, + "rewards/symbolic_reward_partial_score/std": 0.1941937357187271, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0498253107070923, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.9474784135818481, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.23885345458984375, + "epoch": 2.625, + "grad_norm": 0.011274533346295357, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 1638 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.23263810575008392, + "epoch": 2.626602564102564, + "grad_norm": 0.008748682215809822, + "learning_rate": 1e-06, + "loss": 0.0306, + "step": 1639 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.23605605959892273, + "epoch": 2.628205128205128, + "grad_norm": 0.020408501848578453, + "learning_rate": 1e-06, + "loss": -0.0049, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2093.0, + "completions/mean_length": 1151.375, + "completions/mean_terminated_length": 1121.5655517578125, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "entropy": 0.24552424252033234, + "epoch": 2.6298076923076925, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.022998787462711334, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 1052812775.0, + "reward": 0.3075063228607178, + "reward_std": 0.009556922130286694, + "rewards/progression_diversity/mean": -0.0008343125809915364, + "rewards/progression_diversity/std": 0.018091805279254913, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.7125488519668579, + "rewards/symbolic_reward_partial_score/std": 0.18109886348247528, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0533249378204346, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.3622500896453857, + "step": 1641 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2431017979979515, + "epoch": 2.6314102564102564, + "grad_norm": 0.019999193027615547, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 1642 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24063821882009506, + "epoch": 2.6330128205128203, + "grad_norm": 0.01059049554169178, + "learning_rate": 1e-06, + "loss": -0.0094, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.24199344217777252, + "epoch": 2.6346153846153846, + "grad_norm": 0.015171530656516552, + "learning_rate": 1e-06, + "loss": 0.0055, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1190.326171875, + "completions/mean_terminated_length": 1160.5928955078125, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.24322400987148285, + "epoch": 2.636217948717949, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.021913990378379822, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 1054322382.0, + "reward": 0.30744144320487976, + "reward_std": 0.010562664829194546, + "rewards/progression_diversity/mean": -0.0009744351846165955, + "rewards/progression_diversity/std": 0.020204655826091766, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.7129882574081421, + "rewards/symbolic_reward_partial_score/std": 0.17659372091293335, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.053013801574707, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.5290266275405884, + "step": 1645 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2419656366109848, + "epoch": 2.6378205128205128, + "grad_norm": 0.02146642841398716, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1646 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24547536671161652, + "epoch": 2.6394230769230766, + "grad_norm": 0.012392179109156132, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 1647 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.23846372961997986, + "epoch": 2.641025641025641, + "grad_norm": 1201.439208984375, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2423.0, + "completions/mean_length": 1209.61328125, + "completions/mean_terminated_length": 1179.9178466796875, + "completions/min_length": 458.0, + "completions/min_terminated_length": 458.0, + "entropy": 0.24202610552310944, + "epoch": 2.6426282051282053, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.014862586744129658, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 1055858648.0, + "reward": 0.3161856532096863, + "reward_std": 0.019514337182044983, + "rewards/progression_diversity/mean": -0.00106208142824471, + "rewards/progression_diversity/std": 0.018535632640123367, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.6711751222610474, + "rewards/symbolic_reward_partial_score/std": 0.2153313010931015, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0529862642288208, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.4675102233886719, + "step": 1649 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2419712170958519, + "epoch": 2.644230769230769, + "grad_norm": 0.009646804071962833, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 1650 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24016830325126648, + "epoch": 2.6458333333333335, + "grad_norm": 0.02187509275972843, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1651 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2460290938615799, + "epoch": 2.6474358974358974, + "grad_norm": 0.018824264407157898, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3980.0, + "completions/max_terminated_length": 3980.0, + "completions/mean_length": 1227.869140625, + "completions/mean_terminated_length": 1227.869140625, + "completions/min_length": 431.0, + "completions/min_terminated_length": 431.0, + "entropy": 0.24531883001327515, + "epoch": 2.6490384615384617, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.15125811100006104, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 1057351989.0, + "reward": 0.260877788066864, + "reward_std": 0.027386680245399475, + "rewards/progression_diversity/mean": -0.00011138351692352444, + "rewards/progression_diversity/std": 0.0017274393467232585, + "rewards/symbolic_reward_accuracy/mean": 0.09765625, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.6742838621139526, + "rewards/symbolic_reward_partial_score/std": 0.16698527336120605, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0578720569610596, + "sampling/importance_sampling_ratio/min": 0.0001738879072945565, + "sampling/sampling_logp_difference/max": 8.657099723815918, + "sampling/sampling_logp_difference/mean": 0.11238566040992737, + "step": 1653 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2520115375518799, + "epoch": 2.6506410256410255, + "grad_norm": 0.016538454219698906, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.25293679535388947, + "epoch": 2.65224358974359, + "grad_norm": 0.020968111231923103, + "learning_rate": 1e-06, + "loss": -0.0085, + "step": 1655 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.24919523298740387, + "epoch": 2.6538461538461537, + "grad_norm": 0.009686674922704697, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2423.0, + "completions/mean_length": 1314.712890625, + "completions/mean_terminated_length": 1225.8958740234375, + "completions/min_length": 391.0, + "completions/min_terminated_length": 391.0, + "entropy": 0.25128719210624695, + "epoch": 2.655448717948718, + "frac_reward_zero_std": 0.5, + "grad_norm": 296.33270263671875, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 1058869346.0, + "reward": 0.3539027273654938, + "reward_std": 0.02425507828593254, + "rewards/progression_diversity/mean": -0.00230794376693666, + "rewards/progression_diversity/std": 0.02924177795648575, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.7077473998069763, + "rewards/symbolic_reward_partial_score/std": 0.22301146388053894, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0505740642547607, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.750328540802002, + "step": 1657 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24680255353450775, + "epoch": 2.657051282051282, + "grad_norm": 0.022292504087090492, + "learning_rate": 1e-06, + "loss": 0.0275, + "step": 1658 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2595788687467575, + "epoch": 2.6586538461538463, + "grad_norm": 0.007560514844954014, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1659 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.25351230800151825, + "epoch": 2.66025641025641, + "grad_norm": 0.011664212681353092, + "learning_rate": 1e-06, + "loss": 0.0056, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2635.0, + "completions/mean_length": 1181.86328125, + "completions/mean_terminated_length": 1152.113525390625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.2603015750646591, + "epoch": 2.6618589743589745, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.01761663518846035, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 1060424988.0, + "reward": 0.3372478485107422, + "reward_std": 0.02446237951517105, + "rewards/progression_diversity/mean": -0.0007998401415534317, + "rewards/progression_diversity/std": 0.01764107309281826, + "rewards/symbolic_reward_accuracy/mean": 0.181640625, + "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, + "rewards/symbolic_reward_partial_score/mean": 0.7609050273895264, + "rewards/symbolic_reward_partial_score/std": 0.18756793439388275, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0561124086380005, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.5122106075286865, + "step": 1661 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2552846670150757, + "epoch": 2.6634615384615383, + "grad_norm": 0.009117074310779572, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 1662 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.24909861385822296, + "epoch": 2.6650641025641026, + "grad_norm": 0.019102146849036217, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 1663 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.25790928304195404, + "epoch": 2.6666666666666665, + "grad_norm": 0.010081185959279537, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2593.0, + "completions/max_terminated_length": 2593.0, + "completions/mean_length": 1168.1640625, + "completions/mean_terminated_length": 1168.1640625, + "completions/min_length": 350.0, + "completions/min_terminated_length": 350.0, + "entropy": 0.26866577565670013, + "epoch": 2.668269230769231, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.02700965479016304, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 1061838736.0, + "reward": 0.32009169459342957, + "reward_std": 0.021595628932118416, + "rewards/progression_diversity/mean": -0.00010825111530721188, + "rewards/progression_diversity/std": 0.0018366762669757009, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.6919758915901184, + "rewards/symbolic_reward_partial_score/std": 0.20659130811691284, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0602461099624634, + "sampling/importance_sampling_ratio/min": 7.974762411322445e-05, + "sampling/sampling_logp_difference/max": 9.436643600463867, + "sampling/sampling_logp_difference/mean": 0.11774517595767975, + "step": 1665 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2683710306882858, + "epoch": 2.6698717948717947, + "grad_norm": 0.02114093489944935, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 1666 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.268236443400383, + "epoch": 2.671474358974359, + "grad_norm": 0.009307913482189178, + "learning_rate": 1e-06, + "loss": 0.0008, + "step": 1667 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2630407512187958, + "epoch": 2.6730769230769234, + "grad_norm": 0.01072956994175911, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2049.0, + "completions/max_terminated_length": 2049.0, + "completions/mean_length": 1157.5234375, + "completions/mean_terminated_length": 1157.5234375, + "completions/min_length": 318.0, + "completions/min_terminated_length": 318.0, + "entropy": 0.26273058354854584, + "epoch": 2.6746794871794872, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.022477174177765846, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 1063326540.0, + "reward": 0.3207314610481262, + "reward_std": 0.03389011323451996, + "rewards/progression_diversity/mean": -9.633351874072105e-05, + "rewards/progression_diversity/std": 0.0015180562622845173, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.7175455689430237, + "rewards/symbolic_reward_partial_score/std": 0.2033698707818985, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0595388412475586, + "sampling/importance_sampling_ratio/min": 0.0003233875031583011, + "sampling/sampling_logp_difference/max": 8.036659240722656, + "sampling/sampling_logp_difference/mean": 0.11578333377838135, + "step": 1669 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.26235876977443695, + "epoch": 2.676282051282051, + "grad_norm": 0.01767921820282936, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 1670 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2589954137802124, + "epoch": 2.6778846153846154, + "grad_norm": 0.01393952313810587, + "learning_rate": 1e-06, + "loss": 0.0022, + "step": 1671 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2695692628622055, + "epoch": 2.6794871794871797, + "grad_norm": 0.02042005956172943, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2282.0, + "completions/max_terminated_length": 2282.0, + "completions/mean_length": 1089.64453125, + "completions/mean_terminated_length": 1089.64453125, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.2677207440137863, + "epoch": 2.6810897435897436, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.010551492683589458, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 1064821670.0, + "reward": 0.2943932116031647, + "reward_std": 0.024308741092681885, + "rewards/progression_diversity/mean": -0.00013180242967791855, + "rewards/progression_diversity/std": 0.0023829475976526737, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.7156900763511658, + "rewards/symbolic_reward_partial_score/std": 0.18010446429252625, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.060936450958252, + "sampling/importance_sampling_ratio/min": 0.002229129895567894, + "sampling/sampling_logp_difference/max": 6.106143951416016, + "sampling/sampling_logp_difference/mean": 0.11975084245204926, + "step": 1673 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2699047178030014, + "epoch": 2.6826923076923075, + "grad_norm": 0.017179284244775772, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 1674 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.26817895472049713, + "epoch": 2.684294871794872, + "grad_norm": 0.022273728623986244, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1675 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.27202919125556946, + "epoch": 2.685897435897436, + "grad_norm": 0.007613576482981443, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2164.0, + "completions/max_terminated_length": 2164.0, + "completions/mean_length": 1106.2890625, + "completions/mean_terminated_length": 1106.2890625, + "completions/min_length": 440.0, + "completions/min_terminated_length": 440.0, + "entropy": 0.27295637130737305, + "epoch": 2.6875, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.018397819250822067, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 1066328586.0, + "reward": 0.21672746539115906, + "reward_std": 0.02070911042392254, + "rewards/progression_diversity/mean": -0.00010542837844695896, + "rewards/progression_diversity/std": 0.0020016725175082684, + "rewards/symbolic_reward_accuracy/mean": 0.02734375, + "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, + "rewards/symbolic_reward_partial_score/mean": 0.6677408814430237, + "rewards/symbolic_reward_partial_score/std": 0.1652555614709854, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.062442421913147, + "sampling/importance_sampling_ratio/min": 0.002278130268678069, + "sampling/sampling_logp_difference/max": 6.084400177001953, + "sampling/sampling_logp_difference/mean": 0.12101897597312927, + "step": 1677 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2777542769908905, + "epoch": 2.689102564102564, + "grad_norm": 0.009125451557338238, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 1678 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.273501455783844, + "epoch": 2.690705128205128, + "grad_norm": 0.010436906479299068, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 1679 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2719372361898422, + "epoch": 2.6923076923076925, + "grad_norm": 0.012276146560907364, + "learning_rate": 1e-06, + "loss": -0.0069, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2715.0, + "completions/max_terminated_length": 2715.0, + "completions/mean_length": 1127.42578125, + "completions/mean_terminated_length": 1127.42578125, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.27352291345596313, + "epoch": 2.6939102564102564, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.020503515377640724, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 1067652580.0, + "reward": 0.39523687958717346, + "reward_std": 0.03253398835659027, + "rewards/progression_diversity/mean": -0.0002392895403318107, + "rewards/progression_diversity/std": 0.004227474331855774, + "rewards/symbolic_reward_accuracy/mean": 0.2578125, + "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, + "rewards/symbolic_reward_partial_score/mean": 0.8018391728401184, + "rewards/symbolic_reward_partial_score/std": 0.16659002006053925, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0630683898925781, + "sampling/importance_sampling_ratio/min": 0.0012960727326571941, + "sampling/sampling_logp_difference/max": 6.648416519165039, + "sampling/sampling_logp_difference/mean": 0.12232454121112823, + "step": 1681 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2841361165046692, + "epoch": 2.6955128205128203, + "grad_norm": 0.013175196945667267, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2792322635650635, + "epoch": 2.6971153846153846, + "grad_norm": 0.008740157820284367, + "learning_rate": 1e-06, + "loss": -0.0037, + "step": 1683 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2852344512939453, + "epoch": 2.698717948717949, + "grad_norm": 0.007328695617616177, + "learning_rate": 1e-06, + "loss": 0.0062, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2764.0, + "completions/max_terminated_length": 2764.0, + "completions/mean_length": 1182.7734375, + "completions/mean_terminated_length": 1182.7734375, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.27371834218502045, + "epoch": 2.7003205128205128, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.016033172607421875, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 1069083168.0, + "reward": 0.3871532678604126, + "reward_std": 0.036548394709825516, + "rewards/progression_diversity/mean": -8.401693776249886e-06, + "rewards/progression_diversity/std": 0.00019010863616131246, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.7592610716819763, + "rewards/symbolic_reward_partial_score/std": 0.18804891407489777, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619139671325684, + "sampling/importance_sampling_ratio/min": 6.210852006915957e-05, + "sampling/sampling_logp_difference/max": 9.686627388000488, + "sampling/sampling_logp_difference/mean": 0.1205776035785675, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.27223040163517, + "epoch": 2.7019230769230766, + "grad_norm": 0.009487297385931015, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 1686 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2768988460302353, + "epoch": 2.703525641025641, + "grad_norm": 0.010462704114615917, + "learning_rate": 1e-06, + "loss": 0.0031, + "step": 1687 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2717527002096176, + "epoch": 2.7051282051282053, + "grad_norm": 0.016962876543402672, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2341.0, + "completions/max_terminated_length": 2341.0, + "completions/mean_length": 1154.96875, + "completions/mean_terminated_length": 1154.96875, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.2763988673686981, + "epoch": 2.706730769230769, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.011929732747375965, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 1070536032.0, + "reward": 0.3324165344238281, + "reward_std": 0.020372124388813972, + "rewards/progression_diversity/mean": -4.5338445488596335e-05, + "rewards/progression_diversity/std": 0.0006139642791822553, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.7408691644668579, + "rewards/symbolic_reward_partial_score/std": 0.18128949403762817, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0632294416427612, + "sampling/importance_sampling_ratio/min": 0.0005881256074644625, + "sampling/sampling_logp_difference/max": 7.438570022583008, + "sampling/sampling_logp_difference/mean": 0.12137885391712189, + "step": 1689 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.27255162596702576, + "epoch": 2.7083333333333335, + "grad_norm": 0.02167939394712448, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 1690 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.28303536772727966, + "epoch": 2.7099358974358974, + "grad_norm": 0.008452474139630795, + "learning_rate": 1e-06, + "loss": -0.0025, + "step": 1691 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2690129578113556, + "epoch": 2.7115384615384617, + "grad_norm": 0.010433075949549675, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2530.0, + "completions/mean_length": 1084.271484375, + "completions/mean_terminated_length": 1054.3306884765625, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.2840660959482193, + "epoch": 2.7131410256410255, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.007845147512853146, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 1071857355.0, + "reward": 0.41788747906684875, + "reward_std": 0.011897813528776169, + "rewards/progression_diversity/mean": -0.0008051774930208921, + "rewards/progression_diversity/std": 0.01665707863867283, + "rewards/symbolic_reward_accuracy/mean": 0.310546875, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.7718912363052368, + "rewards/symbolic_reward_partial_score/std": 0.1966981589794159, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0603790283203125, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.6770976781845093, + "step": 1693 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.27424998581409454, + "epoch": 2.71474358974359, + "grad_norm": 0.011158179491758347, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 1694 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.279265433549881, + "epoch": 2.7163461538461537, + "grad_norm": 0.012044363655149937, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 1695 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.27725885808467865, + "epoch": 2.717948717948718, + "grad_norm": 0.004128993488848209, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2449.0, + "completions/mean_length": 1174.107421875, + "completions/mean_terminated_length": 1084.461669921875, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.2545585185289383, + "epoch": 2.719551282051282, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.01764233596622944, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 1073390098.0, + "reward": 0.3306075632572174, + "reward_std": 0.017754849046468735, + "rewards/progression_diversity/mean": -0.002232456346973777, + "rewards/progression_diversity/std": 0.02855554409325123, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.731005847454071, + "rewards/symbolic_reward_partial_score/std": 0.17919674515724182, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0483648777008057, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 5.032663345336914, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2522525489330292, + "epoch": 2.7211538461538463, + "grad_norm": 0.014643147587776184, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1698 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.23972028493881226, + "epoch": 2.72275641025641, + "grad_norm": 0.014284703880548477, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 1699 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2528829574584961, + "epoch": 2.7243589743589745, + "grad_norm": 0.007763241417706013, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2125.0, + "completions/mean_length": 1085.763671875, + "completions/mean_terminated_length": 1055.8258056640625, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "entropy": 0.25211136788129807, + "epoch": 2.7259615384615383, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.01731860637664795, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 1074838889.0, + "reward": 0.37423521280288696, + "reward_std": 0.012984178960323334, + "rewards/progression_diversity/mean": -0.0007978131761774421, + "rewards/progression_diversity/std": 0.017717914655804634, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.7552896738052368, + "rewards/symbolic_reward_partial_score/std": 0.20005594193935394, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0550222396850586, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.6980559825897217, + "step": 1701 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.24782519042491913, + "epoch": 2.7275641025641026, + "grad_norm": 0.01143115945160389, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1702 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.25386030972003937, + "epoch": 2.7291666666666665, + "grad_norm": 0.010584347881376743, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1703 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.25097061693668365, + "epoch": 2.730769230769231, + "grad_norm": 0.014512878842651844, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1766.0, + "completions/mean_length": 1077.181640625, + "completions/mean_terminated_length": 1017.1549682617188, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.2463693991303444, + "epoch": 2.7323717948717947, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.022319326177239418, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 1076451366.0, + "reward": 0.3169002830982208, + "reward_std": 0.018300164490938187, + "rewards/progression_diversity/mean": -0.0013787832576781511, + "rewards/progression_diversity/std": 0.02173839509487152, + "rewards/symbolic_reward_accuracy/mean": 0.16796875, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.7204427123069763, + "rewards/symbolic_reward_partial_score/std": 0.20151053369045258, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0520621538162231, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.0996155738830566, + "step": 1705 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24724167585372925, + "epoch": 2.733974358974359, + "grad_norm": 465.19256591796875, + "learning_rate": 1e-06, + "loss": 0.0165, + "step": 1706 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.25137076526880264, + "epoch": 2.7355769230769234, + "grad_norm": 0.011684092693030834, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 1707 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.24099694192409515, + "epoch": 2.7371794871794872, + "grad_norm": 0.01180903147906065, + "learning_rate": 1e-06, + "loss": -0.0007, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2634.0, + "completions/mean_length": 1115.263671875, + "completions/mean_terminated_length": 1025.2711181640625, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.24923396110534668, + "epoch": 2.738782051282051, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.022832239046692848, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 1077919117.0, + "reward": 0.4506452679634094, + "reward_std": 0.006873677484691143, + "rewards/progression_diversity/mean": -0.002368855057284236, + "rewards/progression_diversity/std": 0.02794456295669079, + "rewards/symbolic_reward_accuracy/mean": 0.34375, + "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, + "rewards/symbolic_reward_partial_score/mean": 0.8147298097610474, + "rewards/symbolic_reward_partial_score/std": 0.17687095701694489, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0485377311706543, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 4.422557353973389, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.24275388568639755, + "epoch": 2.7403846153846154, + "grad_norm": 0.020597660914063454, + "learning_rate": 1e-06, + "loss": 0.0452, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.25289829075336456, + "epoch": 2.7419871794871797, + "grad_norm": 0.005713851656764746, + "learning_rate": 1e-06, + "loss": -0.0012, + "step": 1711 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2434333637356758, + "epoch": 2.7435897435897436, + "grad_norm": 0.010084899142384529, + "learning_rate": 1e-06, + "loss": 0.0226, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1117.982421875, + "completions/mean_terminated_length": 1088.107666015625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 0.24176087975502014, + "epoch": 2.7451923076923075, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.014847025275230408, + "learning_rate": 1e-06, + "loss": -0.023, + "num_tokens": 1079362852.0, + "reward": 0.4161057472229004, + "reward_std": 0.020435160025954247, + "rewards/progression_diversity/mean": -0.0012444315943866968, + "rewards/progression_diversity/std": 0.017949236556887627, + "rewards/symbolic_reward_accuracy/mean": 0.302734375, + "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, + "rewards/symbolic_reward_partial_score/mean": 0.7815917730331421, + "rewards/symbolic_reward_partial_score/std": 0.19830654561519623, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052306890487671, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.7721519470214844, + "step": 1713 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2418888807296753, + "epoch": 2.746794871794872, + "grad_norm": 0.005131000652909279, + "learning_rate": 1e-06, + "loss": -0.0005, + "step": 1714 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24066882580518723, + "epoch": 2.748397435897436, + "grad_norm": 0.012241684831678867, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 1715 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2462095245718956, + "epoch": 2.75, + "grad_norm": 0.011896181851625443, + "learning_rate": 1e-06, + "loss": -0.0003, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1122.0, + "completions/mean_terminated_length": 1062.1490478515625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.24476610869169235, + "epoch": 2.751602564102564, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.011216786690056324, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 1080775556.0, + "reward": 0.31833428144454956, + "reward_std": 0.009690433740615845, + "rewards/progression_diversity/mean": -0.0015309633454307914, + "rewards/progression_diversity/std": 0.02212413214147091, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.7486653923988342, + "rewards/symbolic_reward_partial_score/std": 0.18320314586162567, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.050942063331604, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.9385156631469727, + "step": 1717 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23835279792547226, + "epoch": 2.753205128205128, + "grad_norm": 0.012194461189210415, + "learning_rate": 1e-06, + "loss": 0.232, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.24551533162593842, + "epoch": 2.7548076923076925, + "grad_norm": 0.015713347122073174, + "learning_rate": 1e-06, + "loss": 0.0023, + "step": 1719 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.25484059751033783, + "epoch": 2.7564102564102564, + "grad_norm": 0.01567044109106064, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2034.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 970.169921875, + "completions/mean_terminated_length": 970.169921875, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.249983012676239, + "epoch": 2.7580128205128203, + "frac_reward_zero_std": 0.78125, + "grad_norm": 0.010865814052522182, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 1082279163.0, + "reward": 0.2930664122104645, + "reward_std": 0.005064056254923344, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.7268880009651184, + "rewards/symbolic_reward_partial_score/std": 0.17721883952617645, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0566366910934448, + "sampling/importance_sampling_ratio/min": 0.0014420977095142007, + "sampling/sampling_logp_difference/max": 6.541656494140625, + "sampling/sampling_logp_difference/mean": 0.11134675145149231, + "step": 1721 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.25236815214157104, + "epoch": 2.7596153846153846, + "grad_norm": 0.0043575153686106205, + "learning_rate": 1e-06, + "loss": -0.0021, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.25090380012989044, + "epoch": 2.761217948717949, + "grad_norm": 0.00688148895278573, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 1723 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.2414015531539917, + "epoch": 2.7628205128205128, + "grad_norm": 0.0044527724385261536, + "learning_rate": 1e-06, + "loss": 0.002, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1888.0, + "completions/mean_length": 1074.3984375, + "completions/mean_terminated_length": 1044.4383544921875, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.26541852951049805, + "epoch": 2.7644230769230766, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.007640301249921322, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 1083616935.0, + "reward": 0.39704516530036926, + "reward_std": 0.01641342230141163, + "rewards/progression_diversity/mean": -0.0005633344408124685, + "rewards/progression_diversity/std": 0.01171040628105402, + "rewards/symbolic_reward_accuracy/mean": 0.2890625, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.7453775405883789, + "rewards/symbolic_reward_partial_score/std": 0.20832401514053345, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0565519332885742, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.417606234550476, + "step": 1725 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.26599733531475067, + "epoch": 2.766025641025641, + "grad_norm": 0.016951909288764, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 1726 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.25999362766742706, + "epoch": 2.7676282051282053, + "grad_norm": 1.5784918069839478, + "learning_rate": 1e-06, + "loss": 0.0046, + "step": 1727 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2638109028339386, + "epoch": 2.769230769230769, + "grad_norm": 0.009875231422483921, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2051.0, + "completions/mean_length": 1121.234375, + "completions/mean_terminated_length": 1061.3804931640625, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "entropy": 0.2469266802072525, + "epoch": 2.7708333333333335, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.012807636521756649, + "learning_rate": 1e-06, + "loss": 0.0239, + "num_tokens": 1085117135.0, + "reward": 0.3859965205192566, + "reward_std": 0.007197174243628979, + "rewards/progression_diversity/mean": -0.0014215593691915274, + "rewards/progression_diversity/std": 0.021102240309119225, + "rewards/symbolic_reward_accuracy/mean": 0.28125, + "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, + "rewards/symbolic_reward_partial_score/mean": 0.7242025136947632, + "rewards/symbolic_reward_partial_score/std": 0.20905202627182007, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0523045063018799, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 2.6319713592529297, + "step": 1729 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2536063939332962, + "epoch": 2.7724358974358974, + "grad_norm": 0.009693522937595844, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 1730 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.24922660738229752, + "epoch": 2.7740384615384617, + "grad_norm": 0.0067869494669139385, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1731 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.25212302803993225, + "epoch": 2.7756410256410255, + "grad_norm": 0.003368781413882971, + "learning_rate": 1e-06, + "loss": -0.0037, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1915.0, + "completions/mean_length": 1387.89453125, + "completions/mean_terminated_length": 1149.8612060546875, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "entropy": 0.2443920373916626, + "epoch": 2.77724358974359, + "frac_reward_zero_std": 0.5, + "grad_norm": 510.38726806640625, + "learning_rate": 1e-06, + "loss": 0.0371, + "num_tokens": 1086770873.0, + "reward": 0.28071457147598267, + "reward_std": 0.026247015222907066, + "rewards/progression_diversity/mean": -0.0052025578916072845, + "rewards/progression_diversity/std": 0.04079189524054527, + "rewards/symbolic_reward_accuracy/mean": 0.115234375, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.7054198980331421, + "rewards/symbolic_reward_partial_score/std": 0.18301890790462494, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0395216941833496, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 8.083383560180664, + "step": 1733 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.23890351504087448, + "epoch": 2.7788461538461537, + "grad_norm": 0.005275333765894175, + "learning_rate": 1e-06, + "loss": 0.0321, + "step": 1734 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2496597245335579, + "epoch": 2.780448717948718, + "grad_norm": 0.018677234649658203, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.23257097601890564, + "epoch": 2.782051282051282, + "grad_norm": 0.006251712329685688, + "learning_rate": 1e-06, + "loss": 0.0448, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2008.0, + "completions/mean_length": 1218.24609375, + "completions/mean_terminated_length": 1128.860595703125, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.25235116481781006, + "epoch": 2.7836538461538463, + "frac_reward_zero_std": 0.5625, + "grad_norm": 480.960693359375, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 1088287719.0, + "reward": 0.30648308992385864, + "reward_std": 0.03177080303430557, + "rewards/progression_diversity/mean": -0.0020833953749388456, + "rewards/progression_diversity/std": 0.02671230398118496, + "rewards/symbolic_reward_accuracy/mean": 0.150390625, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.7208983898162842, + "rewards/symbolic_reward_partial_score/std": 0.19757305085659027, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0507006645202637, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.8391146659851074, + "step": 1737 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2659503221511841, + "epoch": 2.78525641025641, + "grad_norm": 0.006545082200318575, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 1738 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24851246178150177, + "epoch": 2.7868589743589745, + "grad_norm": 0.011122615076601505, + "learning_rate": 1e-06, + "loss": 0.0275, + "step": 1739 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.26005537807941437, + "epoch": 2.7884615384615383, + "grad_norm": 0.01145921926945448, + "learning_rate": 1e-06, + "loss": -0.0071, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3001.0, + "completions/mean_length": 1462.2109375, + "completions/mean_terminated_length": 1164.9642333984375, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.24173830449581146, + "epoch": 2.7900641025641026, + "frac_reward_zero_std": 0.375, + "grad_norm": 95.36475372314453, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 1089973347.0, + "reward": 0.2687704563140869, + "reward_std": 0.03094436414539814, + "rewards/progression_diversity/mean": -0.007230142131447792, + "rewards/progression_diversity/std": 0.051848724484443665, + "rewards/symbolic_reward_accuracy/mean": 0.08984375, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.7164551019668579, + "rewards/symbolic_reward_partial_score/std": 0.19291111826896667, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.028411865234375, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 14.061389923095703, + "step": 1741 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24212167412042618, + "epoch": 2.7916666666666665, + "grad_norm": 0.023332608863711357, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1742 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.24151303619146347, + "epoch": 2.793269230769231, + "grad_norm": 0.02178528904914856, + "learning_rate": 1e-06, + "loss": 0.0041, + "step": 1743 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2449488863348961, + "epoch": 2.7948717948717947, + "grad_norm": 0.00785065721720457, + "learning_rate": 1e-06, + "loss": 0.0433, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2171.0, + "completions/mean_length": 1198.900390625, + "completions/mean_terminated_length": 1139.35107421875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 0.2584023177623749, + "epoch": 2.796474358974359, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.017909616231918335, + "learning_rate": 1e-06, + "loss": 0.0197, + "num_tokens": 1091436528.0, + "reward": 0.4526541233062744, + "reward_std": 0.034945450723171234, + "rewards/progression_diversity/mean": -0.0016795285046100616, + "rewards/progression_diversity/std": 0.023316482082009315, + "rewards/symbolic_reward_accuracy/mean": 0.37109375, + "rewards/symbolic_reward_accuracy/std": 0.4835699498653412, + "rewards/symbolic_reward_partial_score/mean": 0.7667155265808105, + "rewards/symbolic_reward_partial_score/std": 0.22165250778198242, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0529496669769287, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 3.4778664112091064, + "step": 1745 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.25567905604839325, + "epoch": 2.7980769230769234, + "grad_norm": 1.9885333776474, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 1746 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.26071760058403015, + "epoch": 2.7996794871794872, + "grad_norm": 0.01138362381607294, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 1747 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2582208067178726, + "epoch": 2.801282051282051, + "grad_norm": 0.01637045480310917, + "learning_rate": 1e-06, + "loss": 0.0035, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2307.0, + "completions/mean_length": 1168.337890625, + "completions/mean_terminated_length": 1078.658203125, + "completions/min_length": 308.0, + "completions/min_terminated_length": 308.0, + "entropy": 0.26668301224708557, + "epoch": 2.8028846153846154, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.014873064123094082, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 1092912237.0, + "reward": 0.3330345153808594, + "reward_std": 0.009103953838348389, + "rewards/progression_diversity/mean": -0.0026997888926416636, + "rewards/progression_diversity/std": 0.03452081233263016, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.7358561158180237, + "rewards/symbolic_reward_partial_score/std": 0.1908014863729477, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0509870052337646, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 4.842784881591797, + "step": 1749 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.26296547055244446, + "epoch": 2.8044871794871797, + "grad_norm": 0.014554469846189022, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 1750 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.25916317105293274, + "epoch": 2.8060897435897436, + "grad_norm": 0.012337586842477322, + "learning_rate": 1e-06, + "loss": 0.0315, + "step": 1751 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2664582431316376, + "epoch": 2.8076923076923075, + "grad_norm": 0.006437097210437059, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2030.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1078.322265625, + "completions/mean_terminated_length": 1078.322265625, + "completions/min_length": 334.0, + "completions/min_terminated_length": 334.0, + "entropy": 0.26388947665691376, + "epoch": 2.809294871794872, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.024757713079452515, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 1094326178.0, + "reward": 0.39402249455451965, + "reward_std": 0.020639346912503242, + "rewards/progression_diversity/mean": -9.485271584708244e-05, + "rewards/progression_diversity/std": 0.0016233575297519565, + "rewards/symbolic_reward_accuracy/mean": 0.275390625, + "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, + "rewards/symbolic_reward_partial_score/mean": 0.7626302242279053, + "rewards/symbolic_reward_partial_score/std": 0.1950385868549347, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0604701042175293, + "sampling/importance_sampling_ratio/min": 0.00016087631229311228, + "sampling/sampling_logp_difference/max": 8.734874725341797, + "sampling/sampling_logp_difference/mean": 0.11699830740690231, + "step": 1753 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.26401379704475403, + "epoch": 2.810897435897436, + "grad_norm": 0.006932654418051243, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 1754 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.2620585560798645, + "epoch": 2.8125, + "grad_norm": 0.00885852426290512, + "learning_rate": 1e-06, + "loss": 0.0047, + "step": 1755 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.26447711884975433, + "epoch": 2.814102564102564, + "grad_norm": 0.008724554441869259, + "learning_rate": 1e-06, + "loss": -0.005, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2611.0, + "completions/mean_length": 1278.806640625, + "completions/mean_terminated_length": 1099.6937255859375, + "completions/min_length": 360.0, + "completions/min_terminated_length": 360.0, + "entropy": 0.2583252787590027, + "epoch": 2.815705128205128, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.008783966302871704, + "learning_rate": 1e-06, + "loss": 0.0674, + "num_tokens": 1095875567.0, + "reward": 0.38711655139923096, + "reward_std": 0.0257473886013031, + "rewards/progression_diversity/mean": -0.006121981889009476, + "rewards/progression_diversity/std": 0.05701431632041931, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.7378580570220947, + "rewards/symbolic_reward_partial_score/std": 0.21705490350723267, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0447036027908325, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 7.182675838470459, + "step": 1757 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.273300439119339, + "epoch": 2.8173076923076925, + "grad_norm": 0.01463186927139759, + "learning_rate": 1e-06, + "loss": -0.0055, + "step": 1758 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.2602022588253021, + "epoch": 2.8189102564102564, + "grad_norm": 0.013082200661301613, + "learning_rate": 1e-06, + "loss": 0.0554, + "step": 1759 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.26440712809562683, + "epoch": 2.8205128205128203, + "grad_norm": 0.010828658938407898, + "learning_rate": 1e-06, + "loss": 0.0194, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.001953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2079.0, + "completions/mean_length": 1091.5390625, + "completions/mean_terminated_length": 1061.612548828125, + "completions/min_length": 333.0, + "completions/min_terminated_length": 333.0, + "entropy": 0.28021863102912903, + "epoch": 2.8221153846153846, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.00968155823647976, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 1097272771.0, + "reward": 0.33711838722229004, + "reward_std": 0.00528107350692153, + "rewards/progression_diversity/mean": -0.0010524296667426825, + "rewards/progression_diversity/std": 0.02093338593840599, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.7487630248069763, + "rewards/symbolic_reward_partial_score/std": 0.18481901288032532, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0589430332183838, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 1.9359979629516602, + "step": 1761 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2707153856754303, + "epoch": 2.823717948717949, + "grad_norm": 0.007914133369922638, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2671675831079483, + "epoch": 2.8253205128205128, + "grad_norm": 0.016453703865408897, + "learning_rate": 1e-06, + "loss": 0.4248, + "step": 1763 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.27130477130413055, + "epoch": 2.8269230769230766, + "grad_norm": 0.009142329916357994, + "learning_rate": 1e-06, + "loss": -0.0026, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1321.56640625, + "completions/mean_terminated_length": 1052.0595703125, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "entropy": 0.25689977407455444, + "epoch": 2.828525641025641, + "frac_reward_zero_std": 0.40625, + "grad_norm": 857.88134765625, + "learning_rate": 1e-06, + "loss": 0.0615, + "num_tokens": 1098881909.0, + "reward": 0.3542296588420868, + "reward_std": 0.029991699382662773, + "rewards/progression_diversity/mean": -0.008187920786440372, + "rewards/progression_diversity/std": 0.060944657772779465, + "rewards/symbolic_reward_accuracy/mean": 0.234375, + "rewards/symbolic_reward_accuracy/std": 0.42402184009552, + "rewards/symbolic_reward_partial_score/mean": 0.712939441204071, + "rewards/symbolic_reward_partial_score/std": 0.2108824998140335, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0325154066085815, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 13.213213920593262, + "step": 1765 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2575305104255676, + "epoch": 2.8301282051282053, + "grad_norm": 0.018781762570142746, + "learning_rate": 1e-06, + "loss": 0.0015, + "step": 1766 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2592518925666809, + "epoch": 2.831730769230769, + "grad_norm": 0.010047774761915207, + "learning_rate": 1e-06, + "loss": 0.0387, + "step": 1767 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2506992667913437, + "epoch": 2.8333333333333335, + "grad_norm": 0.012858088128268719, + "learning_rate": 1e-06, + "loss": 0.0435, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1814.0, + "completions/mean_length": 1162.033203125, + "completions/mean_terminated_length": 1072.3162841796875, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.25953665375709534, + "epoch": 2.8349358974358974, + "frac_reward_zero_std": 0.625, + "grad_norm": 402.998046875, + "learning_rate": 1e-06, + "loss": 0.0247, + "num_tokens": 1100398246.0, + "reward": 0.3336543142795563, + "reward_std": 0.019593775272369385, + "rewards/progression_diversity/mean": -0.002245709067210555, + "rewards/progression_diversity/std": 0.029325280338525772, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.7450683116912842, + "rewards/symbolic_reward_partial_score/std": 0.1828693449497223, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0506747961044312, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 4.43061637878418, + "step": 1769 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2642872482538223, + "epoch": 2.8365384615384617, + "grad_norm": 0.010171863250434399, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 1770 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.26166392862796783, + "epoch": 2.8381410256410255, + "grad_norm": 0.009490652941167355, + "learning_rate": 1e-06, + "loss": 0.0014, + "step": 1771 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2598698139190674, + "epoch": 2.83974358974359, + "grad_norm": 0.010799145326018333, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2031.0, + "completions/mean_length": 1231.958984375, + "completions/mean_terminated_length": 1052.29052734375, + "completions/min_length": 339.0, + "completions/min_terminated_length": 339.0, + "entropy": 0.2563736140727997, + "epoch": 2.8413461538461537, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.015560018830001354, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 1102002417.0, + "reward": 0.27941030263900757, + "reward_std": 0.020234711468219757, + "rewards/progression_diversity/mean": -0.005260556936264038, + "rewards/progression_diversity/std": 0.04788186773657799, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.6821939945220947, + "rewards/symbolic_reward_partial_score/std": 0.20531851053237915, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0375680923461914, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 9.631587028503418, + "step": 1773 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.23910512030124664, + "epoch": 2.842948717948718, + "grad_norm": 0.008363538421690464, + "learning_rate": 1e-06, + "loss": 0.0743, + "step": 1774 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.24283453822135925, + "epoch": 2.844551282051282, + "grad_norm": 49193.4765625, + "learning_rate": 1e-06, + "loss": 1.9294, + "step": 1775 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2511303126811981, + "epoch": 2.8461538461538463, + "grad_norm": 0.016176847741007805, + "learning_rate": 1e-06, + "loss": -0.005, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1908.0, + "completions/mean_length": 1422.087890625, + "completions/mean_terminated_length": 1124.0418701171875, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.2574124187231064, + "epoch": 2.84775641025641, + "frac_reward_zero_std": 0.4375, + "grad_norm": 261.94879150390625, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 1103595310.0, + "reward": 0.3236244320869446, + "reward_std": 0.023357607424259186, + "rewards/progression_diversity/mean": -0.0086508272215724, + "rewards/progression_diversity/std": 0.062096692621707916, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.7079427242279053, + "rewards/symbolic_reward_partial_score/std": 0.21444584429264069, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.028921127319336, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 14.721025466918945, + "step": 1777 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2630213350057602, + "epoch": 2.8493589743589745, + "grad_norm": 0.015530991367995739, + "learning_rate": 1e-06, + "loss": 0.0129, + "step": 1778 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.25482146441936493, + "epoch": 2.8509615384615383, + "grad_norm": 2.474681854248047, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 1779 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.24518878757953644, + "epoch": 2.8525641025641026, + "grad_norm": 122.15824890136719, + "learning_rate": 1e-06, + "loss": 0.073, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1969.0, + "completions/mean_length": 1393.974609375, + "completions/mean_terminated_length": 1125.763427734375, + "completions/min_length": 419.0, + "completions/min_terminated_length": 419.0, + "entropy": 0.24117282778024673, + "epoch": 2.8541666666666665, + "frac_reward_zero_std": 0.4375, + "grad_norm": 894.6455688476562, + "learning_rate": 1e-06, + "loss": 0.0781, + "num_tokens": 1105132289.0, + "reward": 0.3265843987464905, + "reward_std": 0.029642684385180473, + "rewards/progression_diversity/mean": -0.007576026022434235, + "rewards/progression_diversity/std": 0.05679089576005936, + "rewards/symbolic_reward_accuracy/mean": 0.181640625, + "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, + "rewards/symbolic_reward_partial_score/mean": 0.7262369394302368, + "rewards/symbolic_reward_partial_score/std": 0.19816945493221283, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0353561639785767, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 11.549863815307617, + "step": 1781 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.26401175558567047, + "epoch": 2.855769230769231, + "grad_norm": 0.0201826523989439, + "learning_rate": 1e-06, + "loss": -0.0045, + "step": 1782 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2670938968658447, + "epoch": 2.8573717948717947, + "grad_norm": 0.00789049081504345, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 1783 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2526419162750244, + "epoch": 2.858974358974359, + "grad_norm": 5.186265468597412, + "learning_rate": 1e-06, + "loss": 0.0221, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2701.0, + "completions/mean_length": 1483.630859375, + "completions/mean_terminated_length": 1186.810791015625, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.2505309730768204, + "epoch": 2.8605769230769234, + "frac_reward_zero_std": 0.40625, + "grad_norm": 22.291501998901367, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 1106690644.0, + "reward": 0.33161357045173645, + "reward_std": 0.03396005928516388, + "rewards/progression_diversity/mean": -0.008566973730921745, + "rewards/progression_diversity/std": 0.061884794384241104, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.7560546398162842, + "rewards/symbolic_reward_partial_score/std": 0.18065473437309265, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0316046476364136, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 12.537492752075195, + "step": 1785 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.23839128017425537, + "epoch": 2.8621794871794872, + "grad_norm": 0.012131825089454651, + "learning_rate": 1e-06, + "loss": 0.1023, + "step": 1786 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2580532729625702, + "epoch": 2.863782051282051, + "grad_norm": 0.007426050491631031, + "learning_rate": 1e-06, + "loss": 0.0244, + "step": 1787 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.258169487118721, + "epoch": 2.8653846153846154, + "grad_norm": 0.017984015867114067, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2401.0, + "completions/mean_length": 1486.4765625, + "completions/mean_terminated_length": 1189.713134765625, + "completions/min_length": 377.0, + "completions/min_terminated_length": 377.0, + "entropy": 0.23781320452690125, + "epoch": 2.8669871794871797, + "frac_reward_zero_std": 0.46875, + "grad_norm": 482.7618713378906, + "learning_rate": 1e-06, + "loss": 0.0416, + "num_tokens": 1108318152.0, + "reward": 0.32290422916412354, + "reward_std": 0.04777061939239502, + "rewards/progression_diversity/mean": -0.008405622094869614, + "rewards/progression_diversity/std": 0.060591381043195724, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.765429675579071, + "rewards/symbolic_reward_partial_score/std": 0.16949279606342316, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0300309658050537, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 12.117807388305664, + "step": 1789 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.24075324833393097, + "epoch": 2.8685897435897436, + "grad_norm": 0.012247085571289062, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 1790 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.23328307271003723, + "epoch": 2.8701923076923075, + "grad_norm": 0.010141503997147083, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1791 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.23753970116376877, + "epoch": 2.871794871794872, + "grad_norm": 0.014962738379836082, + "learning_rate": 1e-06, + "loss": 0.0154, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3149.0, + "completions/mean_length": 1426.01953125, + "completions/mean_terminated_length": 1188.59130859375, + "completions/min_length": 372.0, + "completions/min_terminated_length": 372.0, + "entropy": 0.23268520832061768, + "epoch": 2.873397435897436, + "frac_reward_zero_std": 0.375, + "grad_norm": 389.8380432128906, + "learning_rate": 1e-06, + "loss": 0.042, + "num_tokens": 1109934338.0, + "reward": 0.4080950617790222, + "reward_std": 0.04739544540643692, + "rewards/progression_diversity/mean": -0.005925622768700123, + "rewards/progression_diversity/std": 0.04622631147503853, + "rewards/symbolic_reward_accuracy/mean": 0.29296875, + "rewards/symbolic_reward_accuracy/std": 0.455569326877594, + "rewards/symbolic_reward_partial_score/mean": 0.7745768427848816, + "rewards/symbolic_reward_partial_score/std": 0.19915655255317688, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0313217639923096, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 11.25175666809082, + "step": 1793 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.23954074829816818, + "epoch": 2.875, + "grad_norm": 0.014046828262507915, + "learning_rate": 1e-06, + "loss": 0.0466, + "step": 1794 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24329328536987305, + "epoch": 2.876602564102564, + "grad_norm": 0.012684703804552555, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.23238325119018555, + "epoch": 2.878205128205128, + "grad_norm": 0.020547054708003998, + "learning_rate": 1e-06, + "loss": 0.0476, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1598.23828125, + "completions/mean_terminated_length": 1213.0380859375, + "completions/min_length": 426.0, + "completions/min_terminated_length": 426.0, + "entropy": 0.24311521649360657, + "epoch": 2.8798076923076925, + "frac_reward_zero_std": 0.4375, + "grad_norm": 125.55624389648438, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 1111646460.0, + "reward": 0.3389037549495697, + "reward_std": 0.025876596570014954, + "rewards/progression_diversity/mean": -0.0070867519825696945, + "rewards/progression_diversity/std": 0.045192137360572815, + "rewards/symbolic_reward_accuracy/mean": 0.19921875, + "rewards/symbolic_reward_accuracy/std": 0.39980348944664, + "rewards/symbolic_reward_partial_score/mean": 0.7314778566360474, + "rewards/symbolic_reward_partial_score/std": 0.1943960338830948, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0247465372085571, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 14.375904083251953, + "step": 1797 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23341339081525803, + "epoch": 2.8814102564102564, + "grad_norm": 0.018049802631139755, + "learning_rate": 1e-06, + "loss": 0.1091, + "step": 1798 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2326470911502838, + "epoch": 2.8830128205128203, + "grad_norm": 0.00664905272424221, + "learning_rate": 1e-06, + "loss": 0.0569, + "step": 1799 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2430347353219986, + "epoch": 2.8846153846153846, + "grad_norm": 0.00851722713559866, + "learning_rate": 1e-06, + "loss": 0.0132, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2379.0, + "completions/mean_length": 1422.78515625, + "completions/mean_terminated_length": 1155.08935546875, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.24775538593530655, + "epoch": 2.886217948717949, + "frac_reward_zero_std": 0.5, + "grad_norm": 916.2537841796875, + "learning_rate": 1e-06, + "loss": 0.0312, + "num_tokens": 1113154542.0, + "reward": 0.4589552879333496, + "reward_std": 0.024608189240098, + "rewards/progression_diversity/mean": -0.0053527336567640305, + "rewards/progression_diversity/std": 0.040160540491342545, + "rewards/symbolic_reward_accuracy/mean": 0.361328125, + "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, + "rewards/symbolic_reward_partial_score/mean": 0.8086751699447632, + "rewards/symbolic_reward_partial_score/std": 0.18015848100185394, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0337169170379639, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 10.59257698059082, + "step": 1801 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.24687150865793228, + "epoch": 2.8878205128205128, + "grad_norm": 829.5130615234375, + "learning_rate": 1e-06, + "loss": 0.0824, + "step": 1802 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.25254474580287933, + "epoch": 2.8894230769230766, + "grad_norm": 2157.5517578125, + "learning_rate": 1e-06, + "loss": 0.3136, + "step": 1803 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.24701299518346786, + "epoch": 2.891025641025641, + "grad_norm": 0.011435293592512608, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2067.0, + "completions/mean_length": 1403.126953125, + "completions/mean_terminated_length": 1195.4713134765625, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "entropy": 0.23765964061021805, + "epoch": 2.8926282051282053, + "frac_reward_zero_std": 0.4375, + "grad_norm": 398.1752014160156, + "learning_rate": 1e-06, + "loss": 0.0265, + "num_tokens": 1114716159.0, + "reward": 0.28466612100601196, + "reward_std": 0.02723667398095131, + "rewards/progression_diversity/mean": -0.004093291237950325, + "rewards/progression_diversity/std": 0.03477524220943451, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.7035807371139526, + "rewards/symbolic_reward_partial_score/std": 0.18481120467185974, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0375463962554932, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 8.100292205810547, + "step": 1805 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.24171607196331024, + "epoch": 2.894230769230769, + "grad_norm": 16.580183029174805, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 1806 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2345980554819107, + "epoch": 2.8958333333333335, + "grad_norm": 0.02494910918176174, + "learning_rate": 1e-06, + "loss": 0.0728, + "step": 1807 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2448808252811432, + "epoch": 2.8974358974358974, + "grad_norm": 0.008014152757823467, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3170.0, + "completions/mean_length": 1560.744140625, + "completions/mean_terminated_length": 1204.986083984375, + "completions/min_length": 486.0, + "completions/min_terminated_length": 486.0, + "entropy": 0.2300274297595024, + "epoch": 2.8990384615384617, + "frac_reward_zero_std": 0.46875, + "grad_norm": 1100.1624755859375, + "learning_rate": 1e-06, + "loss": 0.0962, + "num_tokens": 1116373148.0, + "reward": 0.3304210603237152, + "reward_std": 0.0337834395468235, + "rewards/progression_diversity/mean": -0.007212708704173565, + "rewards/progression_diversity/std": 0.04686911776661873, + "rewards/symbolic_reward_accuracy/mean": 0.197265625, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.7084146738052368, + "rewards/symbolic_reward_partial_score/std": 0.2101738452911377, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0344150066375732, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 9.156167030334473, + "step": 1809 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2424936443567276, + "epoch": 2.9006410256410255, + "grad_norm": 0.019458921626210213, + "learning_rate": 1e-06, + "loss": -0.0052, + "step": 1810 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.23142275214195251, + "epoch": 2.90224358974359, + "grad_norm": 0.010163613595068455, + "learning_rate": 1e-06, + "loss": 0.2442, + "step": 1811 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24503599107265472, + "epoch": 2.9038461538461537, + "grad_norm": 0.008980165235698223, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2119.0, + "completions/mean_length": 1808.796875, + "completions/mean_terminated_length": 1185.4176025390625, + "completions/min_length": 324.0, + "completions/min_terminated_length": 324.0, + "entropy": 0.23406819999217987, + "epoch": 2.905448717948718, + "frac_reward_zero_std": 0.375, + "grad_norm": 73.40351867675781, + "learning_rate": 1e-06, + "loss": 0.0672, + "num_tokens": 1118158052.0, + "reward": 0.42395147681236267, + "reward_std": 0.051604464650154114, + "rewards/progression_diversity/mean": -0.012569092214107513, + "rewards/progression_diversity/std": 0.06235016882419586, + "rewards/symbolic_reward_accuracy/mean": 0.330078125, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.7547363042831421, + "rewards/symbolic_reward_partial_score/std": 0.2372274249792099, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0196201801300049, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 15.569795608520508, + "step": 1813 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.222855344414711, + "epoch": 2.907051282051282, + "grad_norm": 1161.0462646484375, + "learning_rate": 1e-06, + "loss": 63.2965, + "step": 1814 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2360314056277275, + "epoch": 2.9086538461538463, + "grad_norm": 173744.71875, + "learning_rate": 1e-06, + "loss": 10.0443, + "step": 1815 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2379593700170517, + "epoch": 2.91025641025641, + "grad_norm": 0.012492096051573753, + "learning_rate": 1e-06, + "loss": 0.0198, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2021.0, + "completions/mean_length": 1775.76171875, + "completions/mean_terminated_length": 1212.7667236328125, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "entropy": 0.22906029224395752, + "epoch": 2.9118589743589745, + "frac_reward_zero_std": 0.28125, + "grad_norm": 874.5888671875, + "learning_rate": 1e-06, + "loss": 0.0862, + "num_tokens": 1119985066.0, + "reward": 0.31860923767089844, + "reward_std": 0.04233834519982338, + "rewards/progression_diversity/mean": -0.011634357273578644, + "rewards/progression_diversity/std": 0.059071075171232224, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.7160645127296448, + "rewards/symbolic_reward_partial_score/std": 0.22811061143875122, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0222280025482178, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 15.096211433410645, + "step": 1817 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.23098096251487732, + "epoch": 2.9134615384615383, + "grad_norm": 0.011047611013054848, + "learning_rate": 1e-06, + "loss": 0.0751, + "step": 1818 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.22944733500480652, + "epoch": 2.9150641025641026, + "grad_norm": 0.008758867159485817, + "learning_rate": 1e-06, + "loss": 0.0656, + "step": 1819 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.23350029438734055, + "epoch": 2.9166666666666665, + "grad_norm": 0.009284427389502525, + "learning_rate": 1e-06, + "loss": 0.0732, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2415.0, + "completions/mean_length": 1809.248046875, + "completions/mean_terminated_length": 1216.7784423828125, + "completions/min_length": 396.0, + "completions/min_terminated_length": 396.0, + "entropy": 0.23025192320346832, + "epoch": 2.918269230769231, + "frac_reward_zero_std": 0.28125, + "grad_norm": 926.9324951171875, + "learning_rate": 1e-06, + "loss": 0.0657, + "num_tokens": 1121805241.0, + "reward": 0.36059021949768066, + "reward_std": 0.031355924904346466, + "rewards/progression_diversity/mean": -0.0122678866609931, + "rewards/progression_diversity/std": 0.062373507767915726, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.7114908695220947, + "rewards/symbolic_reward_partial_score/std": 0.22253166139125824, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0174920558929443, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 17.77764892578125, + "step": 1821 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23264270275831223, + "epoch": 2.9198717948717947, + "grad_norm": 0.008918493054807186, + "learning_rate": 1e-06, + "loss": 0.0674, + "step": 1822 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.22309153527021408, + "epoch": 2.921474358974359, + "grad_norm": 0.0191388800740242, + "learning_rate": 1e-06, + "loss": 0.1132, + "step": 1823 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2341051548719406, + "epoch": 2.9230769230769234, + "grad_norm": 0.009794514626264572, + "learning_rate": 1e-06, + "loss": 0.0327, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2205.0, + "completions/mean_length": 1469.75, + "completions/mean_terminated_length": 1142.2913818359375, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "entropy": 0.24822328984737396, + "epoch": 2.9246794871794872, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.017211588099598885, + "learning_rate": 1e-06, + "loss": -0.0109, + "num_tokens": 1123429833.0, + "reward": 0.3736184239387512, + "reward_std": 0.024019591510295868, + "rewards/progression_diversity/mean": -0.007301281206309795, + "rewards/progression_diversity/std": 0.050444815307855606, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.7514973878860474, + "rewards/symbolic_reward_partial_score/std": 0.1985999196767807, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0336096286773682, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 10.840036392211914, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.23915088176727295, + "epoch": 2.926282051282051, + "grad_norm": 0.015742426738142967, + "learning_rate": 1e-06, + "loss": 0.0593, + "step": 1826 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.23261761665344238, + "epoch": 2.9278846153846154, + "grad_norm": 0.004535183776170015, + "learning_rate": 1e-06, + "loss": 0.0502, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.23625855147838593, + "epoch": 2.9294871794871797, + "grad_norm": 0.011525583453476429, + "learning_rate": 1e-06, + "loss": 0.0662, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1917.0, + "completions/mean_length": 1803.416015625, + "completions/mean_terminated_length": 1210.7093505859375, + "completions/min_length": 412.0, + "completions/min_terminated_length": 412.0, + "entropy": 0.22059651464223862, + "epoch": 2.9310897435897436, + "frac_reward_zero_std": 0.34375, + "grad_norm": 478.7748718261719, + "learning_rate": 1e-06, + "loss": 0.1023, + "num_tokens": 1125201182.0, + "reward": 0.32501205801963806, + "reward_std": 0.03538961708545685, + "rewards/progression_diversity/mean": -0.011980684474110603, + "rewards/progression_diversity/std": 0.05950572341680527, + "rewards/symbolic_reward_accuracy/mean": 0.166015625, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.752392590045929, + "rewards/symbolic_reward_partial_score/std": 0.18106764554977417, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.018181324005127, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 17.928653717041016, + "step": 1829 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.22678129374980927, + "epoch": 2.9326923076923075, + "grad_norm": 0.018833013251423836, + "learning_rate": 1e-06, + "loss": 0.0311, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.22647760808467865, + "epoch": 2.934294871794872, + "grad_norm": 17.264345169067383, + "learning_rate": 1e-06, + "loss": 0.0067, + "step": 1831 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.21786439418792725, + "epoch": 2.935897435897436, + "grad_norm": 0.15827719867229462, + "learning_rate": 1e-06, + "loss": 0.0264, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2052.0, + "completions/mean_length": 2331.796875, + "completions/mean_terminated_length": 1173.1585693359375, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "entropy": 0.21573705226182938, + "epoch": 2.9375, + "frac_reward_zero_std": 0.1875, + "grad_norm": 590.8438110351562, + "learning_rate": 1e-06, + "loss": 0.0605, + "num_tokens": 1127233478.0, + "reward": 0.3646172285079956, + "reward_std": 0.0675165057182312, + "rewards/progression_diversity/mean": -0.02313985861837864, + "rewards/progression_diversity/std": 0.08131039142608643, + "rewards/symbolic_reward_accuracy/mean": 0.2265625, + "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, + "rewards/symbolic_reward_partial_score/mean": 0.7630370855331421, + "rewards/symbolic_reward_partial_score/std": 0.2085689753293991, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808230996131897, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 36.245513916015625, + "step": 1833 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2271483689546585, + "epoch": 2.939102564102564, + "grad_norm": 0.014887535013258457, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1834 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.21168261021375656, + "epoch": 2.940705128205128, + "grad_norm": 0.017205113545060158, + "learning_rate": 1e-06, + "loss": 0.1176, + "step": 1835 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.20247046649456024, + "epoch": 2.9423076923076925, + "grad_norm": 0.014779583550989628, + "learning_rate": 1e-06, + "loss": 0.1315, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2314.0, + "completions/mean_length": 2300.94921875, + "completions/mean_terminated_length": 1171.92822265625, + "completions/min_length": 449.0, + "completions/min_terminated_length": 449.0, + "entropy": 0.20949921756982803, + "epoch": 2.9439102564102564, + "frac_reward_zero_std": 0.34375, + "grad_norm": 870.5208740234375, + "learning_rate": 1e-06, + "loss": 0.0751, + "num_tokens": 1129310636.0, + "reward": 0.45326170325279236, + "reward_std": 0.044423650950193405, + "rewards/progression_diversity/mean": -0.02392907440662384, + "rewards/progression_diversity/std": 0.08498639613389969, + "rewards/symbolic_reward_accuracy/mean": 0.365234375, + "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, + "rewards/symbolic_reward_partial_score/mean": 0.781201183795929, + "rewards/symbolic_reward_partial_score/std": 0.22568704187870026, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9865758419036865, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 33.854530334472656, + "step": 1837 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.21899111568927765, + "epoch": 2.9455128205128203, + "grad_norm": 0.012719093821942806, + "learning_rate": 1e-06, + "loss": 0.0941, + "step": 1838 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.22411292791366577, + "epoch": 2.9471153846153846, + "grad_norm": 0.021860910579562187, + "learning_rate": 1e-06, + "loss": 0.0825, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2084907665848732, + "epoch": 2.948717948717949, + "grad_norm": 0.005877522751688957, + "learning_rate": 1e-06, + "loss": 0.0637, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2297.0, + "completions/mean_length": 2534.21484375, + "completions/mean_terminated_length": 1232.098388671875, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.20938124507665634, + "epoch": 2.9503205128205128, + "frac_reward_zero_std": 0.125, + "grad_norm": 560.36669921875, + "learning_rate": 1e-06, + "loss": 0.12, + "num_tokens": 1131555370.0, + "reward": 0.32735151052474976, + "reward_std": 0.05115168169140816, + "rewards/progression_diversity/mean": -0.03047451376914978, + "rewards/progression_diversity/std": 0.09939130395650864, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.7093750238418579, + "rewards/symbolic_reward_partial_score/std": 0.2335452288389206, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9635010361671448, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 45.37931442260742, + "step": 1841 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2120969444513321, + "epoch": 2.9519230769230766, + "grad_norm": 381.89239501953125, + "learning_rate": 1e-06, + "loss": 821.5478, + "step": 1842 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2079833373427391, + "epoch": 2.953525641025641, + "grad_norm": 0.010464330203831196, + "learning_rate": 1e-06, + "loss": 32.2227, + "step": 1843 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.21436134725809097, + "epoch": 2.9551282051282053, + "grad_norm": 0.02022792212665081, + "learning_rate": 1e-06, + "loss": 0.1652, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.14453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2207.0, + "completions/mean_length": 3449.130859375, + "completions/mean_terminated_length": 1263.78759765625, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "entropy": 0.21006950736045837, + "epoch": 2.956730769230769, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1419.5054931640625, + "learning_rate": 1e-06, + "loss": 0.1083, + "num_tokens": 1134191421.0, + "reward": 0.2933524250984192, + "reward_std": 0.04929710179567337, + "rewards/progression_diversity/mean": -0.05147593468427658, + "rewards/progression_diversity/std": 0.12628963589668274, + "rewards/symbolic_reward_accuracy/mean": 0.150390625, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.6800781488418579, + "rewards/symbolic_reward_partial_score/std": 0.23385514318943024, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9397692680358887, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 57.66311264038086, + "step": 1845 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.1966499239206314, + "epoch": 2.9583333333333335, + "grad_norm": 16499.4140625, + "learning_rate": 1e-06, + "loss": 5.3166, + "step": 1846 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.19310440123081207, + "epoch": 2.9599358974358974, + "grad_norm": 4365.603515625, + "learning_rate": 1e-06, + "loss": 0.5053, + "step": 1847 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.1920364871621132, + "epoch": 2.9615384615384617, + "grad_norm": 0.009575553238391876, + "learning_rate": 1e-06, + "loss": 0.1589, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.12890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2035.0, + "completions/mean_length": 3208.90625, + "completions/mean_terminated_length": 1259.228759765625, + "completions/min_length": 470.0, + "completions/min_terminated_length": 470.0, + "entropy": 0.2075439766049385, + "epoch": 2.9631410256410255, + "frac_reward_zero_std": 0.09375, + "grad_norm": 574.5317993164062, + "learning_rate": 1e-06, + "loss": 0.0662, + "num_tokens": 1136791005.0, + "reward": 0.28958579897880554, + "reward_std": 0.05835293233394623, + "rewards/progression_diversity/mean": -0.04337439686059952, + "rewards/progression_diversity/std": 0.11332310736179352, + "rewards/symbolic_reward_accuracy/mean": 0.142578125, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.6835286617279053, + "rewards/symbolic_reward_partial_score/std": 0.22093914449214935, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9411171674728394, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 56.884559631347656, + "step": 1849 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.19637862592935562, + "epoch": 2.96474358974359, + "grad_norm": 0.008334570564329624, + "learning_rate": 1e-06, + "loss": 0.1807, + "step": 1850 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.390625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.20494883507490158, + "epoch": 2.9663461538461537, + "grad_norm": 0.007710518781095743, + "learning_rate": 1e-06, + "loss": 0.1235, + "step": 1851 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.18237695842981339, + "epoch": 2.967948717948718, + "grad_norm": 0.011762432754039764, + "learning_rate": 1e-06, + "loss": 0.1953, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2194.0, + "completions/mean_length": 2733.078125, + "completions/mean_terminated_length": 1255.70556640625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.2183959111571312, + "epoch": 2.969551282051282, + "frac_reward_zero_std": 0.125, + "grad_norm": 1045.5770263671875, + "learning_rate": 1e-06, + "loss": 0.0877, + "num_tokens": 1139114341.0, + "reward": 0.33328425884246826, + "reward_std": 0.06576712429523468, + "rewards/progression_diversity/mean": -0.030461102724075317, + "rewards/progression_diversity/std": 0.09234879910945892, + "rewards/symbolic_reward_accuracy/mean": 0.203125, + "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, + "rewards/symbolic_reward_partial_score/mean": 0.7057129144668579, + "rewards/symbolic_reward_partial_score/std": 0.23553626239299774, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779849052429199, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 39.03363800048828, + "step": 1853 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2225719690322876, + "epoch": 2.9711538461538463, + "grad_norm": 0.00991111621260643, + "learning_rate": 1e-06, + "loss": 0.1348, + "step": 1854 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.19753701984882355, + "epoch": 2.97275641025641, + "grad_norm": 0.006589308846741915, + "learning_rate": 1e-06, + "loss": 0.145, + "step": 1855 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2158377543091774, + "epoch": 2.9743589743589745, + "grad_norm": 0.009189927950501442, + "learning_rate": 1e-06, + "loss": 0.1266, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2904.0, + "completions/mean_length": 2554.814453125, + "completions/mean_terminated_length": 1254.6346435546875, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.20936352014541626, + "epoch": 2.9759615384615383, + "frac_reward_zero_std": 0.1875, + "grad_norm": 463.4955139160156, + "learning_rate": 1e-06, + "loss": 0.0787, + "num_tokens": 1141373350.0, + "reward": 0.3864889144897461, + "reward_std": 0.0557912215590477, + "rewards/progression_diversity/mean": -0.02347256988286972, + "rewards/progression_diversity/std": 0.07694897800683975, + "rewards/symbolic_reward_accuracy/mean": 0.2734375, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.7422037720680237, + "rewards/symbolic_reward_partial_score/std": 0.24117407202720642, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877347946166992, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 33.34986114501953, + "step": 1857 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.22580835223197937, + "epoch": 2.9775641025641026, + "grad_norm": 0.01121189258992672, + "learning_rate": 1e-06, + "loss": 0.0885, + "step": 1858 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2194606065750122, + "epoch": 2.9791666666666665, + "grad_norm": 0.018390899524092674, + "learning_rate": 1e-06, + "loss": 0.1005, + "step": 1859 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.20080887526273727, + "epoch": 2.980769230769231, + "grad_norm": 0.009253687225282192, + "learning_rate": 1e-06, + "loss": 0.1674, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2504.0, + "completions/mean_length": 2508.169921875, + "completions/mean_terminated_length": 1268.2021484375, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "entropy": 0.23013577610254288, + "epoch": 2.9823717948717947, + "frac_reward_zero_std": 0.1875, + "grad_norm": 397.2406311035156, + "learning_rate": 1e-06, + "loss": 0.0351, + "num_tokens": 1143492301.0, + "reward": 0.39299777150154114, + "reward_std": 0.05584446340799332, + "rewards/progression_diversity/mean": -0.023467358201742172, + "rewards/progression_diversity/std": 0.07824753224849701, + "rewards/symbolic_reward_accuracy/mean": 0.291015625, + "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, + "rewards/symbolic_reward_partial_score/mean": 0.7287434935569763, + "rewards/symbolic_reward_partial_score/std": 0.24250133335590363, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919546842575073, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 31.701099395751953, + "step": 1861 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2217669039964676, + "epoch": 2.983974358974359, + "grad_norm": 53.626285552978516, + "learning_rate": 1e-06, + "loss": 0.0267, + "step": 1862 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.21721980720758438, + "epoch": 2.9855769230769234, + "grad_norm": 0.032945115119218826, + "learning_rate": 1e-06, + "loss": 0.172, + "step": 1863 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.22124967724084854, + "epoch": 2.9871794871794872, + "grad_norm": 0.01755567640066147, + "learning_rate": 1e-06, + "loss": 0.093, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2574.0, + "completions/mean_length": 2252.9453125, + "completions/mean_terminated_length": 1279.4071044921875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "entropy": 0.2342042252421379, + "epoch": 2.988782051282051, + "frac_reward_zero_std": 0.28125, + "grad_norm": 481.52252197265625, + "learning_rate": 1e-06, + "loss": 0.1885, + "num_tokens": 1145451937.0, + "reward": 0.3523566424846649, + "reward_std": 0.04059898108243942, + "rewards/progression_diversity/mean": -0.01775522157549858, + "rewards/progression_diversity/std": 0.06716568768024445, + "rewards/symbolic_reward_accuracy/mean": 0.212890625, + "rewards/symbolic_reward_accuracy/std": 0.409751296043396, + "rewards/symbolic_reward_partial_score/mean": 0.7493326663970947, + "rewards/symbolic_reward_partial_score/std": 0.22399716079235077, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0028358697891235, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 27.260162353515625, + "step": 1865 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.24164949357509613, + "epoch": 2.9903846153846154, + "grad_norm": 14.552428245544434, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 1866 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23062748461961746, + "epoch": 2.9919871794871797, + "grad_norm": 0.030370866879820824, + "learning_rate": 1e-06, + "loss": 0.1004, + "step": 1867 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.24224235862493515, + "epoch": 2.9935897435897436, + "grad_norm": 0.011348499916493893, + "learning_rate": 1e-06, + "loss": 0.1191, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2234.0, + "completions/mean_length": 2649.326171875, + "completions/mean_terminated_length": 1261.09033203125, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.24135691672563553, + "epoch": 2.9951923076923075, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1024.800537109375, + "learning_rate": 1e-06, + "loss": 0.0707, + "num_tokens": 1147683176.0, + "reward": 0.3426709771156311, + "reward_std": 0.04791571944952011, + "rewards/progression_diversity/mean": -0.025381293147802353, + "rewards/progression_diversity/std": 0.07976257801055908, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.7212076187133789, + "rewards/symbolic_reward_partial_score/std": 0.21549275517463684, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9791022539138794, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 38.175758361816406, + "step": 1869 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2204393520951271, + "epoch": 2.996794871794872, + "grad_norm": 1905.4779052734375, + "learning_rate": 1e-06, + "loss": 0.2076, + "step": 1870 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.21573010832071304, + "epoch": 2.998397435897436, + "grad_norm": 12149.6005859375, + "learning_rate": 1e-06, + "loss": 1.1185, + "step": 1871 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.2214333862066269, + "epoch": 3.0, + "grad_norm": 0.018644830211997032, + "learning_rate": 1e-06, + "loss": 0.1475, + "step": 1872 + }, + { + "epoch": 3.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.080078125, + "eval_completions/max_length": 16384.0, + "eval_completions/max_terminated_length": 2229.46875, + "eval_completions/mean_length": 2536.551025390625, + "eval_completions/mean_terminated_length": 1331.3045349121094, + "eval_completions/min_length": 594.8125, + "eval_completions/min_terminated_length": 594.8125, + "eval_entropy": 0.20870172325521708, + "eval_frac_reward_zero_std": 0.1796875, + "eval_loss": 0.04187625274062157, + "eval_num_tokens": 1147683176.0, + "eval_reward": 0.2534480579197407, + "eval_reward_std": 0.0388854734483175, + "eval_rewards/progression_diversity/mean": -0.021589503419818357, + "eval_rewards/progression_diversity/std": 0.07232485536951572, + "eval_rewards/symbolic_reward_accuracy/mean": 0.09130859375, + "eval_rewards/symbolic_reward_accuracy/std": 0.2087636678479612, + "eval_rewards/symbolic_reward_partial_score/mean": 0.6634989399462938, + "eval_rewards/symbolic_reward_partial_score/std": 0.1970880120061338, + "eval_rewards/tag_count_reward/mean": -0.001708984375, + "eval_rewards/tag_count_reward/std": 0.015794883016496897, + "eval_runtime": 4301.5149, + "eval_samples_per_second": 0.058, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.021749384701252, + "eval_sampling/importance_sampling_ratio/min": 0.0, + "eval_sampling/sampling_logp_difference/max": 768.0, + "eval_sampling/sampling_logp_difference/mean": 19.073409140110016, + "eval_steps_per_second": 0.0, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2393.0, + "completions/mean_length": 2402.98046875, + "completions/mean_terminated_length": 1313.9326171875, + "completions/min_length": 359.0, + "completions/min_terminated_length": 359.0, + "entropy": 0.23216699063777924, + "epoch": 3.0016025641025643, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1125.79931640625, + "learning_rate": 1e-06, + "loss": 0.0865, + "num_tokens": 1149811486.0, + "reward": 0.3975224494934082, + "reward_std": 0.04312722384929657, + "rewards/progression_diversity/mean": -0.02021792158484459, + "rewards/progression_diversity/std": 0.07212448120117188, + "rewards/symbolic_reward_accuracy/mean": 0.3046875, + "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, + "rewards/symbolic_reward_partial_score/mean": 0.7163736820220947, + "rewards/symbolic_reward_partial_score/std": 0.23637627065181732, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9978500604629517, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 30.130069732666016, + "step": 1873 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.23934829235076904, + "epoch": 3.003205128205128, + "grad_norm": 0.007034912705421448, + "learning_rate": 1e-06, + "loss": 0.1083, + "step": 1874 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.22745058685541153, + "epoch": 3.0048076923076925, + "grad_norm": 0.00796093326061964, + "learning_rate": 1e-06, + "loss": 0.0727, + "step": 1875 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2292134240269661, + "epoch": 3.0064102564102564, + "grad_norm": 0.019052157178521156, + "learning_rate": 1e-06, + "loss": 0.1367, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2208.0, + "completions/mean_length": 1881.35546875, + "completions/mean_terminated_length": 1322.429931640625, + "completions/min_length": 434.0, + "completions/min_terminated_length": 434.0, + "entropy": 0.2549893334507942, + "epoch": 3.0080128205128207, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.024593627080321312, + "learning_rate": 1e-06, + "loss": 0.0391, + "num_tokens": 1151633332.0, + "reward": 0.2892531454563141, + "reward_std": 0.039773762226104736, + "rewards/progression_diversity/mean": -0.010232724249362946, + "rewards/progression_diversity/std": 0.055212657898664474, + "rewards/symbolic_reward_accuracy/mean": 0.119140625, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.7275390625, + "rewards/symbolic_reward_partial_score/std": 0.19641563296318054, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0221221446990967, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 19.10765838623047, + "step": 1877 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.24741330742835999, + "epoch": 3.0096153846153846, + "grad_norm": 0.01947159692645073, + "learning_rate": 1e-06, + "loss": 0.1312, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24799786508083344, + "epoch": 3.011217948717949, + "grad_norm": 0.02338356338441372, + "learning_rate": 1e-06, + "loss": 0.0404, + "step": 1879 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2448197305202484, + "epoch": 3.0128205128205128, + "grad_norm": 0.014209832064807415, + "learning_rate": 1e-06, + "loss": 0.0894, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3167.0, + "completions/mean_length": 1858.625, + "completions/mean_terminated_length": 1359.7738037109375, + "completions/min_length": 445.0, + "completions/min_terminated_length": 445.0, + "entropy": 0.25408633053302765, + "epoch": 3.014423076923077, + "frac_reward_zero_std": 0.375, + "grad_norm": 454.2471618652344, + "learning_rate": 1e-06, + "loss": 0.0447, + "num_tokens": 1153465092.0, + "reward": 0.3250887095928192, + "reward_std": 0.015932850539684296, + "rewards/progression_diversity/mean": -0.011149164289236069, + "rewards/progression_diversity/std": 0.05865437537431717, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.7096517086029053, + "rewards/symbolic_reward_partial_score/std": 0.19842353463172913, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.028539776802063, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 17.165746688842773, + "step": 1881 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2505646124482155, + "epoch": 3.016025641025641, + "grad_norm": 0.6798326373100281, + "learning_rate": 1e-06, + "loss": 0.0431, + "step": 1882 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.24388013035058975, + "epoch": 3.0176282051282053, + "grad_norm": 1.9514085054397583, + "learning_rate": 1e-06, + "loss": 0.1241, + "step": 1883 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.26686759293079376, + "epoch": 3.019230769230769, + "grad_norm": 0.012397652491927147, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2108.0, + "completions/mean_length": 1794.703125, + "completions/mean_terminated_length": 1414.6212158203125, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "entropy": 0.2557114362716675, + "epoch": 3.0208333333333335, + "frac_reward_zero_std": 0.53125, + "grad_norm": 544.6851196289062, + "learning_rate": 1e-06, + "loss": 0.0255, + "num_tokens": 1155145164.0, + "reward": 0.37460383772850037, + "reward_std": 0.016295883804559708, + "rewards/progression_diversity/mean": -0.008367680944502354, + "rewards/progression_diversity/std": 0.05215869098901749, + "rewards/symbolic_reward_accuracy/mean": 0.25, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.7502604126930237, + "rewards/symbolic_reward_partial_score/std": 0.1986854523420334, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.030568242073059, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 16.76722526550293, + "step": 1885 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.26103954017162323, + "epoch": 3.0224358974358974, + "grad_norm": 91.75881958007812, + "learning_rate": 1e-06, + "loss": 0.2128, + "step": 1886 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2607679069042206, + "epoch": 3.0240384615384617, + "grad_norm": 0.012660062871873379, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 1887 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.26138727366924286, + "epoch": 3.0256410256410255, + "grad_norm": 0.01078207790851593, + "learning_rate": 1e-06, + "loss": 0.0361, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2198.0, + "completions/mean_length": 1951.89453125, + "completions/mean_terminated_length": 1395.6876220703125, + "completions/min_length": 593.0, + "completions/min_terminated_length": 593.0, + "entropy": 0.2467678338289261, + "epoch": 3.02724358974359, + "frac_reward_zero_std": 0.25, + "grad_norm": 572.4014282226562, + "learning_rate": 1e-06, + "loss": 0.0566, + "num_tokens": 1157118710.0, + "reward": 0.226592555642128, + "reward_std": 0.021917399019002914, + "rewards/progression_diversity/mean": -0.01262015849351883, + "rewards/progression_diversity/std": 0.06260491162538528, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.6385416984558105, + "rewards/symbolic_reward_partial_score/std": 0.20352140069007874, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0226848125457764, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 19.33092498779297, + "step": 1889 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.23328816145658493, + "epoch": 3.0288461538461537, + "grad_norm": 20633.263671875, + "learning_rate": 1e-06, + "loss": 4.1374, + "step": 1890 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2414126992225647, + "epoch": 3.030448717948718, + "grad_norm": 6596.80908203125, + "learning_rate": 1e-06, + "loss": 0.5508, + "step": 1891 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2591097354888916, + "epoch": 3.032051282051282, + "grad_norm": 0.01172653678804636, + "learning_rate": 1e-06, + "loss": 0.0182, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2715.0, + "completions/mean_length": 1600.69921875, + "completions/mean_terminated_length": 1366.043701171875, + "completions/min_length": 407.0, + "completions/min_terminated_length": 407.0, + "entropy": 0.26757121086120605, + "epoch": 3.0336538461538463, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.026514563709497452, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 1158840796.0, + "reward": 0.22522126138210297, + "reward_std": 0.04094214737415314, + "rewards/progression_diversity/mean": -0.005220310762524605, + "rewards/progression_diversity/std": 0.04069770127534866, + "rewards/symbolic_reward_accuracy/mean": 0.0390625, + "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, + "rewards/symbolic_reward_partial_score/mean": 0.6740885972976685, + "rewards/symbolic_reward_partial_score/std": 0.1974918395280838, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041705846786499, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 10.610610008239746, + "step": 1893 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25377361476421356, + "epoch": 3.03525641025641, + "grad_norm": 10585.2099609375, + "learning_rate": 1e-06, + "loss": 0.5272, + "step": 1894 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.26954758167266846, + "epoch": 3.0368589743589745, + "grad_norm": 0.022427378222346306, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 1895 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2479778230190277, + "epoch": 3.0384615384615383, + "grad_norm": 0.013517378829419613, + "learning_rate": 1e-06, + "loss": 0.2005, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2292.0, + "completions/mean_length": 1637.05859375, + "completions/mean_terminated_length": 1373.19677734375, + "completions/min_length": 425.0, + "completions/min_terminated_length": 425.0, + "entropy": 0.2555150091648102, + "epoch": 3.0400641025641026, + "frac_reward_zero_std": 0.5, + "grad_norm": 30.618288040161133, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 1160526426.0, + "reward": 0.4024108052253723, + "reward_std": 0.02461879327893257, + "rewards/progression_diversity/mean": -0.005499871913343668, + "rewards/progression_diversity/std": 0.04200981557369232, + "rewards/symbolic_reward_accuracy/mean": 0.275390625, + "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, + "rewards/symbolic_reward_partial_score/mean": 0.790771484375, + "rewards/symbolic_reward_partial_score/std": 0.18546874821186066, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0372226238250732, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 12.745841979980469, + "step": 1897 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2539917603135109, + "epoch": 3.0416666666666665, + "grad_norm": 44032.421875, + "learning_rate": 1e-06, + "loss": 1.0033, + "step": 1898 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.25875651836395264, + "epoch": 3.043269230769231, + "grad_norm": 0.02200913429260254, + "learning_rate": 1e-06, + "loss": 0.0217, + "step": 1899 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.25009360164403915, + "epoch": 3.0448717948717947, + "grad_norm": 0.015072772279381752, + "learning_rate": 1e-06, + "loss": 0.0296, + "step": 1900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2233.0, + "completions/mean_length": 1893.123046875, + "completions/mean_terminated_length": 1395.4566650390625, + "completions/min_length": 557.0, + "completions/min_terminated_length": 557.0, + "entropy": 0.25338225811719894, + "epoch": 3.046474358974359, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.014189253561198711, + "learning_rate": 1e-06, + "loss": 0.0664, + "num_tokens": 1162334969.0, + "reward": 0.31671467423439026, + "reward_std": 0.037882737815380096, + "rewards/progression_diversity/mean": -0.010173956863582134, + "rewards/progression_diversity/std": 0.055442675948143005, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.7090494632720947, + "rewards/symbolic_reward_partial_score/std": 0.20027847588062286, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.020569920539856, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 20.987850189208984, + "step": 1901 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.25274112075567245, + "epoch": 3.048076923076923, + "grad_norm": 0.008120290003716946, + "learning_rate": 1e-06, + "loss": 0.0418, + "step": 1902 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.24684013426303864, + "epoch": 3.0496794871794872, + "grad_norm": 0.010809239000082016, + "learning_rate": 1e-06, + "loss": 0.0503, + "step": 1903 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.25020767748355865, + "epoch": 3.051282051282051, + "grad_norm": 0.0168289951980114, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2925.0, + "completions/mean_length": 1549.30078125, + "completions/mean_terminated_length": 1343.67138671875, + "completions/min_length": 443.0, + "completions/min_terminated_length": 443.0, + "entropy": 0.2655075490474701, + "epoch": 3.0528846153846154, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02596384286880493, + "learning_rate": 1e-06, + "loss": 0.0209, + "num_tokens": 1164024995.0, + "reward": 0.3695905804634094, + "reward_std": 0.033132217824459076, + "rewards/progression_diversity/mean": -0.004322149325162172, + "rewards/progression_diversity/std": 0.03666413947939873, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.7438313961029053, + "rewards/symbolic_reward_partial_score/std": 0.20166073739528656, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0442430973052979, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 9.987215995788574, + "step": 1905 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.25963981449604034, + "epoch": 3.0544871794871793, + "grad_norm": 140.0711669921875, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 1906 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.26139749586582184, + "epoch": 3.0560897435897436, + "grad_norm": 0.00938948430120945, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 1907 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2722902148962021, + "epoch": 3.0576923076923075, + "grad_norm": 0.011267850175499916, + "learning_rate": 1e-06, + "loss": 0.0042, + "step": 1908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2150.0, + "completions/mean_length": 1744.671875, + "completions/mean_terminated_length": 1333.1243896484375, + "completions/min_length": 520.0, + "completions/min_terminated_length": 520.0, + "entropy": 0.2618495374917984, + "epoch": 3.059294871794872, + "frac_reward_zero_std": 0.34375, + "grad_norm": 652.4515380859375, + "learning_rate": 1e-06, + "loss": 0.0214, + "num_tokens": 1165798987.0, + "reward": 0.4160562753677368, + "reward_std": 0.061300039291381836, + "rewards/progression_diversity/mean": -0.009119954891502857, + "rewards/progression_diversity/std": 0.05495789274573326, + "rewards/symbolic_reward_accuracy/mean": 0.314453125, + "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, + "rewards/symbolic_reward_partial_score/mean": 0.7589030265808105, + "rewards/symbolic_reward_partial_score/std": 0.2069811373949051, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0315742492675781, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 16.712371826171875, + "step": 1909 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2622825354337692, + "epoch": 3.0608974358974357, + "grad_norm": 0.021915055811405182, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 1910 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2576129883527756, + "epoch": 3.0625, + "grad_norm": 0.013906807638704777, + "learning_rate": 1e-06, + "loss": 0.0443, + "step": 1911 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2665242552757263, + "epoch": 3.064102564102564, + "grad_norm": 0.012783014215528965, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 1912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2434.0, + "completions/mean_length": 1683.671875, + "completions/mean_terminated_length": 1360.91015625, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "entropy": 0.25943586230278015, + "epoch": 3.065705128205128, + "frac_reward_zero_std": 0.25, + "grad_norm": 292.14520263671875, + "learning_rate": 1e-06, + "loss": 0.0416, + "num_tokens": 1167423891.0, + "reward": 0.37675100564956665, + "reward_std": 0.06550440192222595, + "rewards/progression_diversity/mean": -0.005566404201090336, + "rewards/progression_diversity/std": 0.03809665888547897, + "rewards/symbolic_reward_accuracy/mean": 0.2578125, + "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, + "rewards/symbolic_reward_partial_score/mean": 0.74169921875, + "rewards/symbolic_reward_partial_score/std": 0.21923932433128357, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.039749026298523, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 12.124536514282227, + "step": 1913 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.27193763852119446, + "epoch": 3.0673076923076925, + "grad_norm": 0.013166146352887154, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 1914 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2520969808101654, + "epoch": 3.0689102564102564, + "grad_norm": 3.3681511878967285, + "learning_rate": 1e-06, + "loss": 0.057, + "step": 1915 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.26810455322265625, + "epoch": 3.0705128205128207, + "grad_norm": 0.6854636073112488, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 1916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2121.0, + "completions/mean_length": 1847.66796875, + "completions/mean_terminated_length": 1348.4404296875, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "entropy": 0.261865496635437, + "epoch": 3.0721153846153846, + "frac_reward_zero_std": 0.25, + "grad_norm": 45.9178581237793, + "learning_rate": 1e-06, + "loss": -0.0052, + "num_tokens": 1169237785.0, + "reward": 0.32650455832481384, + "reward_std": 0.09211001545190811, + "rewards/progression_diversity/mean": -0.009701091796159744, + "rewards/progression_diversity/std": 0.052368562668561935, + "rewards/symbolic_reward_accuracy/mean": 0.169921875, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.750781238079071, + "rewards/symbolic_reward_partial_score/std": 0.19588179886341095, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0248017311096191, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 18.72113800048828, + "step": 1917 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.24243663251399994, + "epoch": 3.073717948717949, + "grad_norm": 1099.10400390625, + "learning_rate": 1e-06, + "loss": 0.1447, + "step": 1918 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2528642266988754, + "epoch": 3.0753205128205128, + "grad_norm": 0.01181944552809, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 1919 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.25511983036994934, + "epoch": 3.076923076923077, + "grad_norm": 0.016715671867132187, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 1920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2228.0, + "completions/mean_length": 1621.16796875, + "completions/mean_terminated_length": 1327.087646484375, + "completions/min_length": 613.0, + "completions/min_terminated_length": 613.0, + "entropy": 0.2685047686100006, + "epoch": 3.078525641025641, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.027836158871650696, + "learning_rate": 1e-06, + "loss": 0.0567, + "num_tokens": 1170851103.0, + "reward": 0.36298680305480957, + "reward_std": 0.06454600393772125, + "rewards/progression_diversity/mean": -0.006008030381053686, + "rewards/progression_diversity/std": 0.04314760863780975, + "rewards/symbolic_reward_accuracy/mean": 0.228515625, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.7537760734558105, + "rewards/symbolic_reward_partial_score/std": 0.1955997198820114, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0417566299438477, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 11.107623100280762, + "step": 1921 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.25865650177001953, + "epoch": 3.0801282051282053, + "grad_norm": 3558.986328125, + "learning_rate": 1e-06, + "loss": 0.0839, + "step": 1922 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2664439529180527, + "epoch": 3.081730769230769, + "grad_norm": 0.014974008314311504, + "learning_rate": 1e-06, + "loss": -0.005, + "step": 1923 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.25956542789936066, + "epoch": 3.0833333333333335, + "grad_norm": 0.009708881378173828, + "learning_rate": 1e-06, + "loss": 0.0435, + "step": 1924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 2362.03515625, + "completions/mean_terminated_length": 1301.550537109375, + "completions/min_length": 400.0, + "completions/min_terminated_length": 400.0, + "entropy": 0.24602339416742325, + "epoch": 3.0849358974358974, + "frac_reward_zero_std": 0.34375, + "grad_norm": 400.1549072265625, + "learning_rate": 1e-06, + "loss": 0.0486, + "num_tokens": 1172988833.0, + "reward": 0.3911093473434448, + "reward_std": 0.03172388672828674, + "rewards/progression_diversity/mean": -0.020903022959828377, + "rewards/progression_diversity/std": 0.0763976201415062, + "rewards/symbolic_reward_accuracy/mean": 0.2734375, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.759472668170929, + "rewards/symbolic_reward_partial_score/std": 0.20224013924598694, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9978427886962891, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 31.93221664428711, + "step": 1925 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.25485508143901825, + "epoch": 3.0865384615384617, + "grad_norm": 0.007848401553928852, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 1926 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.23554416000843048, + "epoch": 3.0881410256410255, + "grad_norm": 0.011878496035933495, + "learning_rate": 1e-06, + "loss": 0.092, + "step": 1927 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.21972551941871643, + "epoch": 3.08974358974359, + "grad_norm": 0.0162891186773777, + "learning_rate": 1e-06, + "loss": 0.1825, + "step": 1928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2478.0, + "completions/mean_length": 3023.3359375, + "completions/mean_terminated_length": 1349.58251953125, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "entropy": 0.220417819917202, + "epoch": 3.0913461538461537, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1063.9591064453125, + "learning_rate": 1e-06, + "loss": 0.1881, + "num_tokens": 1175419421.0, + "reward": 0.32698261737823486, + "reward_std": 0.024275176227092743, + "rewards/progression_diversity/mean": -0.031721051782369614, + "rewards/progression_diversity/std": 0.09057775884866714, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.7316243648529053, + "rewards/symbolic_reward_partial_score/std": 0.19131198525428772, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9758140444755554, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 42.80847930908203, + "step": 1929 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2313474789261818, + "epoch": 3.092948717948718, + "grad_norm": 0.009462164714932442, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 1930 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.23852744698524475, + "epoch": 3.094551282051282, + "grad_norm": 0.00927420798689127, + "learning_rate": 1e-06, + "loss": 0.0868, + "step": 1931 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.21710015833377838, + "epoch": 3.0961538461538463, + "grad_norm": 0.015988515689969063, + "learning_rate": 1e-06, + "loss": 0.1895, + "step": 1932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2227.0, + "completions/mean_length": 2461.416015625, + "completions/mean_terminated_length": 1376.919921875, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.24317217618227005, + "epoch": 3.09775641025641, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1258.6036376953125, + "learning_rate": 1e-06, + "loss": 0.09, + "num_tokens": 1177535618.0, + "reward": 0.2823217809200287, + "reward_std": 0.015544566325843334, + "rewards/progression_diversity/mean": -0.020752854645252228, + "rewards/progression_diversity/std": 0.07482607662677765, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.6937174797058105, + "rewards/symbolic_reward_partial_score/std": 0.17785848677158356, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0054988861083984, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 28.267414093017578, + "step": 1933 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.24698983132839203, + "epoch": 3.0993589743589745, + "grad_norm": 2.280089855194092, + "learning_rate": 1e-06, + "loss": 0.063, + "step": 1934 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.24258919060230255, + "epoch": 3.1009615384615383, + "grad_norm": 0.01370900496840477, + "learning_rate": 1e-06, + "loss": 0.0806, + "step": 1935 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.24202165007591248, + "epoch": 3.1025641025641026, + "grad_norm": 0.010354790836572647, + "learning_rate": 1e-06, + "loss": 0.0415, + "step": 1936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2383.0, + "completions/mean_length": 2724.0078125, + "completions/mean_terminated_length": 1343.3204345703125, + "completions/min_length": 516.0, + "completions/min_terminated_length": 516.0, + "entropy": 0.23228947818279266, + "epoch": 3.1041666666666665, + "frac_reward_zero_std": 0.28125, + "grad_norm": 134.23583984375, + "learning_rate": 1e-06, + "loss": 0.0476, + "num_tokens": 1179819430.0, + "reward": 0.35509487986564636, + "reward_std": 0.032482314854860306, + "rewards/progression_diversity/mean": -0.02615930885076523, + "rewards/progression_diversity/std": 0.08310631662607193, + "rewards/symbolic_reward_accuracy/mean": 0.21484375, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.7567870616912842, + "rewards/symbolic_reward_partial_score/std": 0.19756251573562622, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9856817722320557, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 37.22113800048828, + "step": 1937 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.21816729754209518, + "epoch": 3.105769230769231, + "grad_norm": 22.10169792175293, + "learning_rate": 1e-06, + "loss": 0.1096, + "step": 1938 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.23560451716184616, + "epoch": 3.1073717948717947, + "grad_norm": 861.4119873046875, + "learning_rate": 1e-06, + "loss": 0.1194, + "step": 1939 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.237302266061306, + "epoch": 3.108974358974359, + "grad_norm": 0.013657595030963421, + "learning_rate": 1e-06, + "loss": 0.0515, + "step": 1940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2743.0, + "completions/mean_length": 2930.619140625, + "completions/mean_terminated_length": 1377.1785888671875, + "completions/min_length": 395.0, + "completions/min_terminated_length": 395.0, + "entropy": 0.23268328607082367, + "epoch": 3.110576923076923, + "frac_reward_zero_std": 0.25, + "grad_norm": 118.8416976928711, + "learning_rate": 1e-06, + "loss": 0.0817, + "num_tokens": 1182230323.0, + "reward": 0.37685173749923706, + "reward_std": 0.045152001082897186, + "rewards/progression_diversity/mean": -0.029186122119426727, + "rewards/progression_diversity/std": 0.08660390228033066, + "rewards/symbolic_reward_accuracy/mean": 0.2578125, + "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, + "rewards/symbolic_reward_partial_score/mean": 0.7434733510017395, + "rewards/symbolic_reward_partial_score/std": 0.20847611129283905, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919402599334717, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 33.77556610107422, + "step": 1941 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2476828247308731, + "epoch": 3.1121794871794872, + "grad_norm": 2530.47900390625, + "learning_rate": 1e-06, + "loss": 0.222, + "step": 1942 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22099629044532776, + "epoch": 3.113782051282051, + "grad_norm": 0.0288047194480896, + "learning_rate": 1e-06, + "loss": 0.1299, + "step": 1943 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2267403081059456, + "epoch": 3.1153846153846154, + "grad_norm": 0.01607387140393257, + "learning_rate": 1e-06, + "loss": 0.1327, + "step": 1944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2399.0, + "completions/mean_length": 2772.017578125, + "completions/mean_terminated_length": 1396.182861328125, + "completions/min_length": 611.0, + "completions/min_terminated_length": 611.0, + "entropy": 0.2250806912779808, + "epoch": 3.1169871794871793, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1792.7398681640625, + "learning_rate": 1e-06, + "loss": 0.1175, + "num_tokens": 1184533004.0, + "reward": 0.28835493326187134, + "reward_std": 0.013276607729494572, + "rewards/progression_diversity/mean": -0.02485804632306099, + "rewards/progression_diversity/std": 0.07890177518129349, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.7126628160476685, + "rewards/symbolic_reward_partial_score/std": 0.17205090820789337, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.004730463027954, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 27.43810272216797, + "step": 1945 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2443278580904007, + "epoch": 3.1185897435897436, + "grad_norm": 4188.24609375, + "learning_rate": 1e-06, + "loss": 0.5792, + "step": 1946 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2495875358581543, + "epoch": 3.1201923076923075, + "grad_norm": 0.09753794223070145, + "learning_rate": 1e-06, + "loss": 0.0398, + "step": 1947 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24468916654586792, + "epoch": 3.121794871794872, + "grad_norm": 0.011647003702819347, + "learning_rate": 1e-06, + "loss": 0.0852, + "step": 1948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2344.0, + "completions/mean_length": 2688.154296875, + "completions/mean_terminated_length": 1432.45849609375, + "completions/min_length": 568.0, + "completions/min_terminated_length": 568.0, + "entropy": 0.2420196533203125, + "epoch": 3.123397435897436, + "frac_reward_zero_std": 0.28125, + "grad_norm": 359.50616455078125, + "learning_rate": 1e-06, + "loss": 0.1009, + "num_tokens": 1186778155.0, + "reward": 0.32234811782836914, + "reward_std": 0.037715356796979904, + "rewards/progression_diversity/mean": -0.022023096680641174, + "rewards/progression_diversity/std": 0.07346688210964203, + "rewards/symbolic_reward_accuracy/mean": 0.177734375, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.7197591066360474, + "rewards/symbolic_reward_partial_score/std": 0.20028026401996613, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001284122467041, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 28.891010284423828, + "step": 1949 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.24103453755378723, + "epoch": 3.125, + "grad_norm": 589.58740234375, + "learning_rate": 1e-06, + "loss": 2.001, + "step": 1950 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.24162401258945465, + "epoch": 3.126602564102564, + "grad_norm": 5572.74560546875, + "learning_rate": 1e-06, + "loss": 0.4189, + "step": 1951 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.234800323843956, + "epoch": 3.128205128205128, + "grad_norm": 475.8064270019531, + "learning_rate": 1e-06, + "loss": 0.2053, + "step": 1952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2726.0, + "completions/mean_length": 3146.59375, + "completions/mean_terminated_length": 1455.4713134765625, + "completions/min_length": 452.0, + "completions/min_terminated_length": 452.0, + "entropy": 0.2195385992527008, + "epoch": 3.1298076923076925, + "frac_reward_zero_std": 0.15625, + "grad_norm": 503.7859802246094, + "learning_rate": 1e-06, + "loss": 0.1427, + "num_tokens": 1189316267.0, + "reward": 0.269339382648468, + "reward_std": 0.020436681807041168, + "rewards/progression_diversity/mean": -0.027488097548484802, + "rewards/progression_diversity/std": 0.0782787874341011, + "rewards/symbolic_reward_accuracy/mean": 0.09765625, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.7034016847610474, + "rewards/symbolic_reward_partial_score/std": 0.17726561427116394, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874823093414307, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 35.55792999267578, + "step": 1953 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.25392740964889526, + "epoch": 3.1314102564102564, + "grad_norm": 6332.8427734375, + "learning_rate": 1e-06, + "loss": 0.1834, + "step": 1954 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2183244675397873, + "epoch": 3.1330128205128207, + "grad_norm": 0.011082753539085388, + "learning_rate": 1e-06, + "loss": 0.1257, + "step": 1955 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.21788176894187927, + "epoch": 3.1346153846153846, + "grad_norm": 0.012296471744775772, + "learning_rate": 1e-06, + "loss": 0.1528, + "step": 1956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2483.0, + "completions/mean_length": 3008.599609375, + "completions/mean_terminated_length": 1464.163330078125, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "entropy": 0.2399488091468811, + "epoch": 3.136217948717949, + "frac_reward_zero_std": 0.15625, + "grad_norm": 634.225341796875, + "learning_rate": 1e-06, + "loss": 0.1405, + "num_tokens": 1191638702.0, + "reward": 0.31707698106765747, + "reward_std": 0.049435004591941833, + "rewards/progression_diversity/mean": -0.02521352283656597, + "rewards/progression_diversity/std": 0.07498207688331604, + "rewards/symbolic_reward_accuracy/mean": 0.169921875, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.7185709476470947, + "rewards/symbolic_reward_partial_score/std": 0.21633154153823853, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985991895198822, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 37.30690002441406, + "step": 1957 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2299083024263382, + "epoch": 3.1378205128205128, + "grad_norm": 0.010621101595461369, + "learning_rate": 1e-06, + "loss": 0.1298, + "step": 1958 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2408609539270401, + "epoch": 3.139423076923077, + "grad_norm": 0.011294880881905556, + "learning_rate": 1e-06, + "loss": 0.0936, + "step": 1959 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.23295968770980835, + "epoch": 3.141025641025641, + "grad_norm": 0.014650301076471806, + "learning_rate": 1e-06, + "loss": 0.0687, + "step": 1960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2611.0, + "completions/mean_length": 3111.46875, + "completions/mean_terminated_length": 1415.8590087890625, + "completions/min_length": 392.0, + "completions/min_terminated_length": 392.0, + "entropy": 0.24924977123737335, + "epoch": 3.1426282051282053, + "frac_reward_zero_std": 0.1875, + "grad_norm": 237.02276611328125, + "learning_rate": 1e-06, + "loss": 0.0949, + "num_tokens": 1194090830.0, + "reward": 0.3610386848449707, + "reward_std": 0.037800274789333344, + "rewards/progression_diversity/mean": -0.028455514460802078, + "rewards/progression_diversity/std": 0.08039110153913498, + "rewards/symbolic_reward_accuracy/mean": 0.224609375, + "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, + "rewards/symbolic_reward_partial_score/mean": 0.7551920413970947, + "rewards/symbolic_reward_partial_score/std": 0.19215147197246552, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815695285797119, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 40.95595169067383, + "step": 1961 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.22469578683376312, + "epoch": 3.144230769230769, + "grad_norm": 0.009706591255962849, + "learning_rate": 1e-06, + "loss": 0.1529, + "step": 1962 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.22220056504011154, + "epoch": 3.1458333333333335, + "grad_norm": 0.02369273081421852, + "learning_rate": 1e-06, + "loss": 0.1648, + "step": 1963 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.25326360762119293, + "epoch": 3.1474358974358974, + "grad_norm": 0.015049627050757408, + "learning_rate": 1e-06, + "loss": 0.051, + "step": 1964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2528.0, + "completions/mean_length": 2517.6328125, + "completions/mean_terminated_length": 1405.9830322265625, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "entropy": 0.25177982449531555, + "epoch": 3.1490384615384617, + "frac_reward_zero_std": 0.3125, + "grad_norm": 872.0093994140625, + "learning_rate": 1e-06, + "loss": 0.0878, + "num_tokens": 1196269138.0, + "reward": 0.3650954067707062, + "reward_std": 0.032753195613622665, + "rewards/progression_diversity/mean": -0.019758375361561775, + "rewards/progression_diversity/std": 0.07037756592035294, + "rewards/symbolic_reward_accuracy/mean": 0.23828125, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.7410807013511658, + "rewards/symbolic_reward_partial_score/std": 0.20055758953094482, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.004058599472046, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 30.5396671295166, + "step": 1965 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.25476063042879105, + "epoch": 3.1506410256410255, + "grad_norm": 0.01801050268113613, + "learning_rate": 1e-06, + "loss": 0.0639, + "step": 1966 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.24143607169389725, + "epoch": 3.15224358974359, + "grad_norm": 0.009572253562510014, + "learning_rate": 1e-06, + "loss": 0.114, + "step": 1967 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2614782154560089, + "epoch": 3.1538461538461537, + "grad_norm": 0.02647644467651844, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 1968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2685.0, + "completions/mean_length": 2340.412109375, + "completions/mean_terminated_length": 1497.21533203125, + "completions/min_length": 473.0, + "completions/min_terminated_length": 473.0, + "entropy": 0.2505124807357788, + "epoch": 3.155448717948718, + "frac_reward_zero_std": 0.25, + "grad_norm": 1409.9781494140625, + "learning_rate": 1e-06, + "loss": 0.1449, + "num_tokens": 1198283685.0, + "reward": 0.4389854073524475, + "reward_std": 0.043696288019418716, + "rewards/progression_diversity/mean": -0.015524028800427914, + "rewards/progression_diversity/std": 0.06471197307109833, + "rewards/symbolic_reward_accuracy/mean": 0.34765625, + "rewards/symbolic_reward_accuracy/std": 0.47669193148612976, + "rewards/symbolic_reward_partial_score/mean": 0.7691406011581421, + "rewards/symbolic_reward_partial_score/std": 0.21009880304336548, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0053313970565796, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 29.34543800354004, + "step": 1969 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2559570074081421, + "epoch": 3.157051282051282, + "grad_norm": 1054.045654296875, + "learning_rate": 1e-06, + "loss": 0.082, + "step": 1970 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2517922967672348, + "epoch": 3.1586538461538463, + "grad_norm": 10.967491149902344, + "learning_rate": 1e-06, + "loss": -0.0051, + "step": 1971 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24725372344255447, + "epoch": 3.16025641025641, + "grad_norm": 0.010240145027637482, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 1972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2716.0, + "completions/mean_length": 2950.55859375, + "completions/mean_terminated_length": 1496.722900390625, + "completions/min_length": 423.0, + "completions/min_terminated_length": 423.0, + "entropy": 0.25035959482192993, + "epoch": 3.1618589743589745, + "frac_reward_zero_std": 0.25, + "grad_norm": 633.6778564453125, + "learning_rate": 1e-06, + "loss": 0.0946, + "num_tokens": 1200602867.0, + "reward": 0.33387240767478943, + "reward_std": 0.026298727840185165, + "rewards/progression_diversity/mean": -0.02584683895111084, + "rewards/progression_diversity/std": 0.07925648987293243, + "rewards/symbolic_reward_accuracy/mean": 0.197265625, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.71923828125, + "rewards/symbolic_reward_partial_score/std": 0.1927872896194458, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9945404529571533, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 35.76648712158203, + "step": 1973 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.25055961310863495, + "epoch": 3.1634615384615383, + "grad_norm": 278.57867431640625, + "learning_rate": 1e-06, + "loss": 0.0723, + "step": 1974 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23513774573802948, + "epoch": 3.1650641025641026, + "grad_norm": 0.15121746063232422, + "learning_rate": 1e-06, + "loss": 0.1125, + "step": 1975 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2441992089152336, + "epoch": 3.1666666666666665, + "grad_norm": 0.013731342740356922, + "learning_rate": 1e-06, + "loss": 0.0602, + "step": 1976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3033.0, + "completions/mean_length": 3148.57421875, + "completions/mean_terminated_length": 1457.704833984375, + "completions/min_length": 573.0, + "completions/min_terminated_length": 573.0, + "entropy": 0.21663058549165726, + "epoch": 3.168269230769231, + "frac_reward_zero_std": 0.09375, + "grad_norm": 515.6699829101562, + "learning_rate": 1e-06, + "loss": 0.0905, + "num_tokens": 1203158377.0, + "reward": 0.2782261371612549, + "reward_std": 0.03023504465818405, + "rewards/progression_diversity/mean": -0.029924802482128143, + "rewards/progression_diversity/std": 0.08487491309642792, + "rewards/symbolic_reward_accuracy/mean": 0.119140625, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.6914387941360474, + "rewards/symbolic_reward_partial_score/std": 0.19170799851417542, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9659879207611084, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 49.51683807373047, + "step": 1977 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2332163155078888, + "epoch": 3.1698717948717947, + "grad_norm": 2.1581897735595703, + "learning_rate": 1e-06, + "loss": 0.0941, + "step": 1978 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.21122775226831436, + "epoch": 3.171474358974359, + "grad_norm": 0.049657199531793594, + "learning_rate": 1e-06, + "loss": 0.1684, + "step": 1979 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24628569930791855, + "epoch": 3.173076923076923, + "grad_norm": 0.019608579576015472, + "learning_rate": 1e-06, + "loss": 0.0439, + "step": 1980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3310.0, + "completions/mean_length": 2705.47265625, + "completions/mean_terminated_length": 1451.3646240234375, + "completions/min_length": 542.0, + "completions/min_terminated_length": 542.0, + "entropy": 0.26659801602363586, + "epoch": 3.1746794871794872, + "frac_reward_zero_std": 0.21875, + "grad_norm": 256.61602783203125, + "learning_rate": 1e-06, + "loss": 0.0419, + "num_tokens": 1205377627.0, + "reward": 0.40071532130241394, + "reward_std": 0.029286310076713562, + "rewards/progression_diversity/mean": -0.022705569863319397, + "rewards/progression_diversity/std": 0.07598486542701721, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.7830891609191895, + "rewards/symbolic_reward_partial_score/std": 0.1975134015083313, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9810075759887695, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 43.57196044921875, + "step": 1981 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23883583396673203, + "epoch": 3.176282051282051, + "grad_norm": 2.147189140319824, + "learning_rate": 1e-06, + "loss": 0.0706, + "step": 1982 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.26977184414863586, + "epoch": 3.1778846153846154, + "grad_norm": 78.77035522460938, + "learning_rate": 1e-06, + "loss": 0.0521, + "step": 1983 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23553596436977386, + "epoch": 3.1794871794871793, + "grad_norm": 0.018339024856686592, + "learning_rate": 1e-06, + "loss": 0.5282, + "step": 1984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2776.0, + "completions/mean_length": 2496.416015625, + "completions/mean_terminated_length": 1477.410888671875, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "entropy": 0.2516746520996094, + "epoch": 3.1810897435897436, + "frac_reward_zero_std": 0.25, + "grad_norm": 319.26788330078125, + "learning_rate": 1e-06, + "loss": 0.0477, + "num_tokens": 1207524576.0, + "reward": 0.3050941824913025, + "reward_std": 0.039040304720401764, + "rewards/progression_diversity/mean": -0.017927538603544235, + "rewards/progression_diversity/std": 0.06751146912574768, + "rewards/symbolic_reward_accuracy/mean": 0.146484375, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.7259114980697632, + "rewards/symbolic_reward_partial_score/std": 0.20375892519950867, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0033478736877441, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 31.887889862060547, + "step": 1985 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.26808346807956696, + "epoch": 3.1826923076923075, + "grad_norm": 0.020371215417981148, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 1986 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.24439021944999695, + "epoch": 3.184294871794872, + "grad_norm": 0.026235945522785187, + "learning_rate": 1e-06, + "loss": 0.0765, + "step": 1987 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25858327746391296, + "epoch": 3.185897435897436, + "grad_norm": 0.016966918483376503, + "learning_rate": 1e-06, + "loss": 0.0628, + "step": 1988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2761.0, + "completions/mean_length": 1891.138671875, + "completions/mean_terminated_length": 1363.0587158203125, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "entropy": 0.28316083550453186, + "epoch": 3.1875, + "frac_reward_zero_std": 0.3125, + "grad_norm": 3698.61865234375, + "learning_rate": 1e-06, + "loss": 0.0706, + "num_tokens": 1209406615.0, + "reward": 0.3008219003677368, + "reward_std": 0.029717685654759407, + "rewards/progression_diversity/mean": -0.010097447782754898, + "rewards/progression_diversity/std": 0.053449101746082306, + "rewards/symbolic_reward_accuracy/mean": 0.142578125, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.7185709476470947, + "rewards/symbolic_reward_partial_score/std": 0.17875374853610992, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0198476314544678, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 25.146913528442383, + "step": 1989 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2714526355266571, + "epoch": 3.189102564102564, + "grad_norm": 7592.54052734375, + "learning_rate": 1e-06, + "loss": 0.8777, + "step": 1990 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.28693751990795135, + "epoch": 3.190705128205128, + "grad_norm": 20502.578125, + "learning_rate": 1e-06, + "loss": 2.4146, + "step": 1991 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.283732146024704, + "epoch": 3.1923076923076925, + "grad_norm": 0.011515190824866295, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2856.0, + "completions/mean_length": 1891.53125, + "completions/mean_terminated_length": 1363.465576171875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.2893388271331787, + "epoch": 3.1939102564102564, + "frac_reward_zero_std": 0.375, + "grad_norm": 136.03277587890625, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 1211197511.0, + "reward": 0.38298678398132324, + "reward_std": 0.05362379923462868, + "rewards/progression_diversity/mean": -0.010407653637230396, + "rewards/progression_diversity/std": 0.05494730547070503, + "rewards/symbolic_reward_accuracy/mean": 0.26171875, + "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, + "rewards/symbolic_reward_partial_score/mean": 0.754833996295929, + "rewards/symbolic_reward_partial_score/std": 0.21281343698501587, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0291106700897217, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 21.492839813232422, + "step": 1993 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2943323999643326, + "epoch": 3.1955128205128207, + "grad_norm": 0.021322406828403473, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 1994 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.29063861072063446, + "epoch": 3.1971153846153846, + "grad_norm": 0.012699613347649574, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 1995 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.28081028163433075, + "epoch": 3.198717948717949, + "grad_norm": 0.00812804140150547, + "learning_rate": 1e-06, + "loss": 0.0823, + "step": 1996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2786.0, + "completions/mean_length": 1646.9921875, + "completions/mean_terminated_length": 1353.4263916015625, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "entropy": 0.30514436960220337, + "epoch": 3.2003205128205128, + "frac_reward_zero_std": 0.4375, + "grad_norm": 60.97207260131836, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 1212900051.0, + "reward": 0.32839280366897583, + "reward_std": 0.04186505824327469, + "rewards/progression_diversity/mean": -0.005448305979371071, + "rewards/progression_diversity/std": 0.039716459810733795, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.7497720718383789, + "rewards/symbolic_reward_partial_score/std": 0.1919063776731491, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0453128814697266, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 13.722127914428711, + "step": 1997 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.29429739713668823, + "epoch": 3.201923076923077, + "grad_norm": 349.4608459472656, + "learning_rate": 1e-06, + "loss": 0.0811, + "step": 1998 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.30618786811828613, + "epoch": 3.203525641025641, + "grad_norm": 0.018446296453475952, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 1999 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.29249128699302673, + "epoch": 3.2051282051282053, + "grad_norm": 0.015572361648082733, + "learning_rate": 1e-06, + "loss": 0.0433, + "step": 2000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2926.0, + "completions/mean_length": 1567.892578125, + "completions/mean_terminated_length": 1362.5208740234375, + "completions/min_length": 600.0, + "completions/min_terminated_length": 600.0, + "entropy": 0.2950544059276581, + "epoch": 3.206730769230769, + "frac_reward_zero_std": 0.34375, + "grad_norm": 891.14892578125, + "learning_rate": 1e-06, + "loss": 0.0646, + "num_tokens": 1214607692.0, + "reward": 0.26205992698669434, + "reward_std": 0.03215021640062332, + "rewards/progression_diversity/mean": -0.003969744313508272, + "rewards/progression_diversity/std": 0.03686157613992691, + "rewards/symbolic_reward_accuracy/mean": 0.087890625, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.6991862058639526, + "rewards/symbolic_reward_partial_score/std": 0.1865617036819458, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0514659881591797, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 10.364349365234375, + "step": 2001 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.29572050273418427, + "epoch": 3.2083333333333335, + "grad_norm": 0.008014540188014507, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 2002 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2993382215499878, + "epoch": 3.2099358974358974, + "grad_norm": 0.021720819175243378, + "learning_rate": 1e-06, + "loss": -0.0056, + "step": 2003 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3016369044780731, + "epoch": 3.2115384615384617, + "grad_norm": 0.0189630426466465, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3574.0, + "completions/mean_length": 1687.447265625, + "completions/mean_terminated_length": 1364.7684326171875, + "completions/min_length": 483.0, + "completions/min_terminated_length": 483.0, + "entropy": 0.29340440034866333, + "epoch": 3.2131410256410255, + "frac_reward_zero_std": 0.5, + "grad_norm": 509.89044189453125, + "learning_rate": 1e-06, + "loss": 0.0367, + "num_tokens": 1216400945.0, + "reward": 0.3805909752845764, + "reward_std": 0.03878272697329521, + "rewards/progression_diversity/mean": -0.007796908728778362, + "rewards/progression_diversity/std": 0.05300579220056534, + "rewards/symbolic_reward_accuracy/mean": 0.259765625, + "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, + "rewards/symbolic_reward_partial_score/mean": 0.7513183355331421, + "rewards/symbolic_reward_partial_score/std": 0.19548480212688446, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0402953624725342, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 16.1550235748291, + "step": 2005 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.30069899559020996, + "epoch": 3.21474358974359, + "grad_norm": 0.02268948033452034, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 2006 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3007173389196396, + "epoch": 3.2163461538461537, + "grad_norm": 0.008483972400426865, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 2007 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2845690995454788, + "epoch": 3.217948717948718, + "grad_norm": 0.010253841057419777, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 2008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2020.0, + "completions/mean_length": 1595.154296875, + "completions/mean_terminated_length": 1360.4107666015625, + "completions/min_length": 422.0, + "completions/min_terminated_length": 422.0, + "entropy": 0.28091390430927277, + "epoch": 3.219551282051282, + "frac_reward_zero_std": 0.5625, + "grad_norm": 675.5444946289062, + "learning_rate": 1e-06, + "loss": 0.0337, + "num_tokens": 1218156384.0, + "reward": 0.2970985472202301, + "reward_std": 0.03322647139430046, + "rewards/progression_diversity/mean": -0.005477225407958031, + "rewards/progression_diversity/std": 0.044522009789943695, + "rewards/symbolic_reward_accuracy/mean": 0.142578125, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.7066569328308105, + "rewards/symbolic_reward_partial_score/std": 0.1901051253080368, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0481798648834229, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 11.399556159973145, + "step": 2009 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29631251096725464, + "epoch": 3.2211538461538463, + "grad_norm": 0.01047491654753685, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 2010 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2931921035051346, + "epoch": 3.22275641025641, + "grad_norm": 0.03937433660030365, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 2011 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.29075874388217926, + "epoch": 3.2243589743589745, + "grad_norm": 0.010890385136008263, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2629.0, + "completions/mean_length": 1558.37890625, + "completions/mean_terminated_length": 1382.5810546875, + "completions/min_length": 729.0, + "completions/min_terminated_length": 729.0, + "entropy": 0.282899871468544, + "epoch": 3.2259615384615383, + "frac_reward_zero_std": 0.5, + "grad_norm": 980.3523559570312, + "learning_rate": 1e-06, + "loss": 0.0471, + "num_tokens": 1219817762.0, + "reward": 0.3504934310913086, + "reward_std": 0.030621502548456192, + "rewards/progression_diversity/mean": -0.004368623252958059, + "rewards/progression_diversity/std": 0.040239714086055756, + "rewards/symbolic_reward_accuracy/mean": 0.228515625, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.71142578125, + "rewards/symbolic_reward_partial_score/std": 0.2032359093427658, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0505871772766113, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 9.663378715515137, + "step": 2013 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2933056503534317, + "epoch": 3.2275641025641026, + "grad_norm": 0.016422249376773834, + "learning_rate": 1e-06, + "loss": 0.0012, + "step": 2014 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2866353541612625, + "epoch": 3.2291666666666665, + "grad_norm": 0.020436976104974747, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 2015 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.28925251960754395, + "epoch": 3.230769230769231, + "grad_norm": 0.018621278926730156, + "learning_rate": 1e-06, + "loss": 0.0175, + "step": 2016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2538.0, + "completions/mean_length": 1511.5234375, + "completions/mean_terminated_length": 1335.1700439453125, + "completions/min_length": 736.0, + "completions/min_terminated_length": 736.0, + "entropy": 0.29188382625579834, + "epoch": 3.2323717948717947, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.03130757808685303, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 1221518830.0, + "reward": 0.3500271439552307, + "reward_std": 0.07550652325153351, + "rewards/progression_diversity/mean": -0.0036349566653370857, + "rewards/progression_diversity/std": 0.03353985399007797, + "rewards/symbolic_reward_accuracy/mean": 0.23828125, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.6909668445587158, + "rewards/symbolic_reward_partial_score/std": 0.22590909898281097, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0503188371658325, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 8.717741012573242, + "step": 2017 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2773764133453369, + "epoch": 3.233974358974359, + "grad_norm": 0.017145292833447456, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 2018 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.27354469895362854, + "epoch": 3.235576923076923, + "grad_norm": 0.011831255629658699, + "learning_rate": 1e-06, + "loss": 0.0443, + "step": 2019 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2817593514919281, + "epoch": 3.2371794871794872, + "grad_norm": 0.016237668693065643, + "learning_rate": 1e-06, + "loss": 0.0098, + "step": 2020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2421.0, + "completions/mean_length": 1795.888671875, + "completions/mean_terminated_length": 1325.304443359375, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "entropy": 0.2711557298898697, + "epoch": 3.238782051282051, + "frac_reward_zero_std": 0.40625, + "grad_norm": 163.37515258789062, + "learning_rate": 1e-06, + "loss": 0.0598, + "num_tokens": 1223317701.0, + "reward": 0.3071795105934143, + "reward_std": 0.022627130150794983, + "rewards/progression_diversity/mean": -0.010563986375927925, + "rewards/progression_diversity/std": 0.05871390923857689, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.6570963263511658, + "rewards/symbolic_reward_partial_score/std": 0.21646232903003693, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0290942192077637, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 19.123592376708984, + "step": 2021 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2706785798072815, + "epoch": 3.2403846153846154, + "grad_norm": 0.007887561805546284, + "learning_rate": 1e-06, + "loss": 0.0574, + "step": 2022 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2705296725034714, + "epoch": 3.2419871794871793, + "grad_norm": 0.016599016264081, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2023 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.26619046926498413, + "epoch": 3.2435897435897436, + "grad_norm": 0.014114703983068466, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 2024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2063.0, + "completions/mean_length": 1582.685546875, + "completions/mean_terminated_length": 1317.850830078125, + "completions/min_length": 603.0, + "completions/min_terminated_length": 603.0, + "entropy": 0.27589017152786255, + "epoch": 3.2451923076923075, + "frac_reward_zero_std": 0.5, + "grad_norm": 714.7753295898438, + "learning_rate": 1e-06, + "loss": 0.0405, + "num_tokens": 1224991732.0, + "reward": 0.32780733704566956, + "reward_std": 0.02535293623805046, + "rewards/progression_diversity/mean": -0.006377549842000008, + "rewards/progression_diversity/std": 0.04781962186098099, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.7276692390441895, + "rewards/symbolic_reward_partial_score/std": 0.2073507308959961, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.040276288986206, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 13.916572570800781, + "step": 2025 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.278276264667511, + "epoch": 3.246794871794872, + "grad_norm": 641.1627807617188, + "learning_rate": 1e-06, + "loss": 0.081, + "step": 2026 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2818632125854492, + "epoch": 3.248397435897436, + "grad_norm": 0.015987196937203407, + "learning_rate": 1e-06, + "loss": 0.0139, + "step": 2027 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2770925313234329, + "epoch": 3.25, + "grad_norm": 0.011531771160662174, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 2028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2199.0, + "completions/mean_length": 1594.681640625, + "completions/mean_terminated_length": 1330.0615234375, + "completions/min_length": 639.0, + "completions/min_terminated_length": 639.0, + "entropy": 0.28490838408470154, + "epoch": 3.251602564102564, + "frac_reward_zero_std": 0.53125, + "grad_norm": 630.0040283203125, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 1226658273.0, + "reward": 0.3503781259059906, + "reward_std": 0.02471214532852173, + "rewards/progression_diversity/mean": -0.0061342837288975716, + "rewards/progression_diversity/std": 0.0460776686668396, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.7469075322151184, + "rewards/symbolic_reward_partial_score/std": 0.19170016050338745, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.040297508239746, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 14.458892822265625, + "step": 2029 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.28325770795345306, + "epoch": 3.253205128205128, + "grad_norm": 0.02777559868991375, + "learning_rate": 1e-06, + "loss": 0.0386, + "step": 2030 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2924428731203079, + "epoch": 3.2548076923076925, + "grad_norm": 0.005939020775258541, + "learning_rate": 1e-06, + "loss": -0.0084, + "step": 2031 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.26891082525253296, + "epoch": 3.2564102564102564, + "grad_norm": 0.010626004077494144, + "learning_rate": 1e-06, + "loss": 0.1295, + "step": 2032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2179.0, + "completions/mean_length": 1786.677734375, + "completions/mean_terminated_length": 1315.79638671875, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "entropy": 0.29806482791900635, + "epoch": 3.2580128205128207, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.012230968102812767, + "learning_rate": 1e-06, + "loss": -0.0145, + "num_tokens": 1228425196.0, + "reward": 0.32576221227645874, + "reward_std": 0.05162687227129936, + "rewards/progression_diversity/mean": -0.010695156641304493, + "rewards/progression_diversity/std": 0.05995798856019974, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.717089831829071, + "rewards/symbolic_reward_partial_score/std": 0.22520095109939575, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0235651731491089, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 23.860057830810547, + "step": 2033 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2714930325746536, + "epoch": 3.2596153846153846, + "grad_norm": 0.009989427402615547, + "learning_rate": 1e-06, + "loss": 0.145, + "step": 2034 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2667497843503952, + "epoch": 3.261217948717949, + "grad_norm": 0.01684493198990822, + "learning_rate": 1e-06, + "loss": 0.1274, + "step": 2035 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2843262106180191, + "epoch": 3.2628205128205128, + "grad_norm": 0.014857680536806583, + "learning_rate": 1e-06, + "loss": 0.0398, + "step": 2036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3595.0, + "completions/mean_length": 1610.28125, + "completions/mean_terminated_length": 1345.9403076171875, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "entropy": 0.2858640253543854, + "epoch": 3.264423076923077, + "frac_reward_zero_std": 0.53125, + "grad_norm": 336.9473571777344, + "learning_rate": 1e-06, + "loss": 0.0252, + "num_tokens": 1230085228.0, + "reward": 0.3623395264148712, + "reward_std": 0.04071733355522156, + "rewards/progression_diversity/mean": -0.005795993376523256, + "rewards/progression_diversity/std": 0.04330339655280113, + "rewards/symbolic_reward_accuracy/mean": 0.228515625, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.7535644173622131, + "rewards/symbolic_reward_partial_score/std": 0.21194274723529816, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045485258102417, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 12.358026504516602, + "step": 2037 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2867206782102585, + "epoch": 3.266025641025641, + "grad_norm": 0.012359599582850933, + "learning_rate": 1e-06, + "loss": 0.0326, + "step": 2038 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.279480904340744, + "epoch": 3.2676282051282053, + "grad_norm": 0.009706364013254642, + "learning_rate": 1e-06, + "loss": 0.0751, + "step": 2039 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2926556468009949, + "epoch": 3.269230769230769, + "grad_norm": 0.013265096582472324, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 2040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3382.0, + "completions/mean_length": 1684.5390625, + "completions/mean_terminated_length": 1361.79638671875, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.27776144444942474, + "epoch": 3.2708333333333335, + "frac_reward_zero_std": 0.46875, + "grad_norm": 754.3441162109375, + "learning_rate": 1e-06, + "loss": 0.0869, + "num_tokens": 1231778640.0, + "reward": 0.43440812826156616, + "reward_std": 0.034479107707738876, + "rewards/progression_diversity/mean": -0.00694271782413125, + "rewards/progression_diversity/std": 0.04713256284594536, + "rewards/symbolic_reward_accuracy/mean": 0.326171875, + "rewards/symbolic_reward_accuracy/std": 0.4692695140838623, + "rewards/symbolic_reward_partial_score/mean": 0.7978678941726685, + "rewards/symbolic_reward_partial_score/std": 0.1925002634525299, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.036149501800537, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 17.656431198120117, + "step": 2041 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29258349537849426, + "epoch": 3.2724358974358974, + "grad_norm": 0.03969258442521095, + "learning_rate": 1e-06, + "loss": -0.0049, + "step": 2042 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2861692011356354, + "epoch": 3.2740384615384617, + "grad_norm": 0.007797840982675552, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 2043 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2861800342798233, + "epoch": 3.2756410256410255, + "grad_norm": 0.024233318865299225, + "learning_rate": 1e-06, + "loss": 0.0363, + "step": 2044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2312.0, + "completions/mean_length": 1453.421875, + "completions/mean_terminated_length": 1335.8582763671875, + "completions/min_length": 547.0, + "completions/min_terminated_length": 547.0, + "entropy": 0.29494452476501465, + "epoch": 3.27724358974359, + "frac_reward_zero_std": 0.40625, + "grad_norm": 475.1607971191406, + "learning_rate": 1e-06, + "loss": 0.0204, + "num_tokens": 1233400632.0, + "reward": 0.3379926085472107, + "reward_std": 0.04110347479581833, + "rewards/progression_diversity/mean": -0.00249581434763968, + "rewards/progression_diversity/std": 0.027295051142573357, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.7068033814430237, + "rewards/symbolic_reward_partial_score/std": 0.20671771466732025, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.05837082862854, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 6.645764350891113, + "step": 2045 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.29623962938785553, + "epoch": 3.2788461538461537, + "grad_norm": 0.015234522521495819, + "learning_rate": 1e-06, + "loss": 0.0231, + "step": 2046 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2966182827949524, + "epoch": 3.280448717948718, + "grad_norm": 0.028103666380047798, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 2047 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.29752831161022186, + "epoch": 3.282051282051282, + "grad_norm": 0.016002511605620384, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 2048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2178.0, + "completions/mean_length": 1659.55859375, + "completions/mean_terminated_length": 1396.099365234375, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "entropy": 0.3017704486846924, + "epoch": 3.2836538461538463, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.022180970758199692, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 1235048998.0, + "reward": 0.34624069929122925, + "reward_std": 0.03489365801215172, + "rewards/progression_diversity/mean": -0.005324858706444502, + "rewards/progression_diversity/std": 0.0404798686504364, + "rewards/symbolic_reward_accuracy/mean": 0.20703125, + "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, + "rewards/symbolic_reward_partial_score/mean": 0.7448079586029053, + "rewards/symbolic_reward_partial_score/std": 0.2022588700056076, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0514497756958008, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 10.927322387695312, + "step": 2049 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.09375, + "entropy": 0.3079407513141632, + "epoch": 3.28525641025641, + "grad_norm": 1578.2557373046875, + "learning_rate": 1e-06, + "loss": 0.112, + "step": 2050 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.29569441080093384, + "epoch": 3.2868589743589745, + "grad_norm": 0.012577169574797153, + "learning_rate": 1e-06, + "loss": 0.0704, + "step": 2051 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2986031025648117, + "epoch": 3.2884615384615383, + "grad_norm": 0.0136114452034235, + "learning_rate": 1e-06, + "loss": 0.0306, + "step": 2052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2134.0, + "completions/mean_length": 1793.57421875, + "completions/mean_terminated_length": 1443.404052734375, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "entropy": 0.306133508682251, + "epoch": 3.2900641025641026, + "frac_reward_zero_std": 0.3125, + "grad_norm": 395.223876953125, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 1236814236.0, + "reward": 0.4016110897064209, + "reward_std": 0.042516425251960754, + "rewards/progression_diversity/mean": -0.007838626392185688, + "rewards/progression_diversity/std": 0.05088994279503822, + "rewards/symbolic_reward_accuracy/mean": 0.267578125, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.8090169429779053, + "rewards/symbolic_reward_partial_score/std": 0.18975462019443512, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0399129390716553, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 16.826345443725586, + "step": 2053 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.304185152053833, + "epoch": 3.2916666666666665, + "grad_norm": 0.020139280706644058, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 2054 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.28031550347805023, + "epoch": 3.293269230769231, + "grad_norm": 0.008938499726355076, + "learning_rate": 1e-06, + "loss": 0.2255, + "step": 2055 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.29509237408638, + "epoch": 3.2948717948717947, + "grad_norm": 0.008660328574478626, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 2056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2419.0, + "completions/mean_length": 1667.703125, + "completions/mean_terminated_length": 1434.1112060546875, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.2912152111530304, + "epoch": 3.296474358974359, + "frac_reward_zero_std": 0.375, + "grad_norm": 549.008056640625, + "learning_rate": 1e-06, + "loss": 0.0571, + "num_tokens": 1238580100.0, + "reward": 0.28469109535217285, + "reward_std": 0.039379484951496124, + "rewards/progression_diversity/mean": -0.004526130389422178, + "rewards/progression_diversity/std": 0.03515040501952171, + "rewards/symbolic_reward_accuracy/mean": 0.119140625, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.7153971195220947, + "rewards/symbolic_reward_partial_score/std": 0.1853335201740265, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0521401166915894, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 10.635208129882812, + "step": 2057 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2999632656574249, + "epoch": 3.298076923076923, + "grad_norm": 0.015014038421213627, + "learning_rate": 1e-06, + "loss": 0.0142, + "step": 2058 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2982807457447052, + "epoch": 3.2996794871794872, + "grad_norm": 0.021116536110639572, + "learning_rate": 1e-06, + "loss": 0.0415, + "step": 2059 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.30165646970272064, + "epoch": 3.301282051282051, + "grad_norm": 0.02505728229880333, + "learning_rate": 1e-06, + "loss": 0.0484, + "step": 2060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2724.0, + "completions/mean_length": 1967.10546875, + "completions/mean_terminated_length": 1441.7935791015625, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "entropy": 0.28198811411857605, + "epoch": 3.3028846153846154, + "frac_reward_zero_std": 0.3125, + "grad_norm": 2657.8173828125, + "learning_rate": 1e-06, + "loss": 0.1443, + "num_tokens": 1240427130.0, + "reward": 0.38248294591903687, + "reward_std": 0.058625657111406326, + "rewards/progression_diversity/mean": -0.010982503183186054, + "rewards/progression_diversity/std": 0.05810944363474846, + "rewards/symbolic_reward_accuracy/mean": 0.271484375, + "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, + "rewards/symbolic_reward_partial_score/mean": 0.7408040761947632, + "rewards/symbolic_reward_partial_score/std": 0.22931715846061707, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0257844924926758, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 24.58890151977539, + "step": 2061 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.305512472987175, + "epoch": 3.3044871794871793, + "grad_norm": 0.010081891901791096, + "learning_rate": 1e-06, + "loss": 0.0085, + "step": 2062 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.30105258524417877, + "epoch": 3.3060897435897436, + "grad_norm": 0.019923444837331772, + "learning_rate": 1e-06, + "loss": 0.0702, + "step": 2063 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2942555248737335, + "epoch": 3.3076923076923075, + "grad_norm": 0.01970398612320423, + "learning_rate": 1e-06, + "loss": 0.0693, + "step": 2064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2750.0, + "completions/mean_length": 1714.302734375, + "completions/mean_terminated_length": 1481.450439453125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "entropy": 0.3109530508518219, + "epoch": 3.309294871794872, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02412906475365162, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 1242207125.0, + "reward": 0.36652231216430664, + "reward_std": 0.056298792362213135, + "rewards/progression_diversity/mean": -0.005487251095473766, + "rewards/progression_diversity/std": 0.04296133294701576, + "rewards/symbolic_reward_accuracy/mean": 0.25, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.7264810800552368, + "rewards/symbolic_reward_partial_score/std": 0.2101718783378601, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0529332160949707, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 11.437311172485352, + "step": 2065 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2926006019115448, + "epoch": 3.310897435897436, + "grad_norm": 0.01898755133152008, + "learning_rate": 1e-06, + "loss": 0.0776, + "step": 2066 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3063723295927048, + "epoch": 3.3125, + "grad_norm": 0.014987506903707981, + "learning_rate": 1e-06, + "loss": 0.0091, + "step": 2067 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.29933975636959076, + "epoch": 3.314102564102564, + "grad_norm": 0.020037300884723663, + "learning_rate": 1e-06, + "loss": 0.0595, + "step": 2068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2365.0, + "completions/mean_length": 1686.85546875, + "completions/mean_terminated_length": 1453.5675048828125, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "entropy": 0.3088431656360626, + "epoch": 3.315705128205128, + "frac_reward_zero_std": 0.5, + "grad_norm": 310.1370544433594, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 1243928939.0, + "reward": 0.328504741191864, + "reward_std": 0.027106130495667458, + "rewards/progression_diversity/mean": -0.005487233866006136, + "rewards/progression_diversity/std": 0.04299917817115784, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.7319173216819763, + "rewards/symbolic_reward_partial_score/std": 0.20580171048641205, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0549488067626953, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 10.485675811767578, + "step": 2069 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3059859573841095, + "epoch": 3.3173076923076925, + "grad_norm": 178.32826232910156, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 2070 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.29551123082637787, + "epoch": 3.3189102564102564, + "grad_norm": 0.01883271336555481, + "learning_rate": 1e-06, + "loss": 0.0948, + "step": 2071 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.30632631480693817, + "epoch": 3.3205128205128207, + "grad_norm": 0.015535010024905205, + "learning_rate": 1e-06, + "loss": 0.005, + "step": 2072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2583.0, + "completions/mean_length": 1778.39453125, + "completions/mean_terminated_length": 1487.4462890625, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 0.30395227670669556, + "epoch": 3.3221153846153846, + "frac_reward_zero_std": 0.40625, + "grad_norm": 893.9067993164062, + "learning_rate": 1e-06, + "loss": 0.0203, + "num_tokens": 1245691061.0, + "reward": 0.36694616079330444, + "reward_std": 0.03838976100087166, + "rewards/progression_diversity/mean": -0.006556159816682339, + "rewards/progression_diversity/std": 0.04625479876995087, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.7370442152023315, + "rewards/symbolic_reward_partial_score/std": 0.21306481957435608, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0472426414489746, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 14.168490409851074, + "step": 2073 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2978048026561737, + "epoch": 3.323717948717949, + "grad_norm": 10739.20703125, + "learning_rate": 1e-06, + "loss": 0.3728, + "step": 2074 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30055759847164154, + "epoch": 3.3253205128205128, + "grad_norm": 0.01299914252012968, + "learning_rate": 1e-06, + "loss": 0.0633, + "step": 2075 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.30361442267894745, + "epoch": 3.326923076923077, + "grad_norm": 0.014193286187946796, + "learning_rate": 1e-06, + "loss": 0.0325, + "step": 2076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3344.0, + "completions/mean_length": 1754.5703125, + "completions/mean_terminated_length": 1463.1474609375, + "completions/min_length": 561.0, + "completions/min_terminated_length": 561.0, + "entropy": 0.2972850501537323, + "epoch": 3.328525641025641, + "frac_reward_zero_std": 0.5625, + "grad_norm": 378.9573974609375, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 1247440089.0, + "reward": 0.43345290422439575, + "reward_std": 0.04101278632879257, + "rewards/progression_diversity/mean": -0.005784530192613602, + "rewards/progression_diversity/std": 0.04120200499892235, + "rewards/symbolic_reward_accuracy/mean": 0.3359375, + "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, + "rewards/symbolic_reward_partial_score/mean": 0.778369128704071, + "rewards/symbolic_reward_partial_score/std": 0.21870289742946625, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049149513244629, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 12.310261726379395, + "step": 2077 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.29869405925273895, + "epoch": 3.3301282051282053, + "grad_norm": 0.006345598492771387, + "learning_rate": 1e-06, + "loss": 0.0603, + "step": 2078 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2994239032268524, + "epoch": 3.331730769230769, + "grad_norm": 0.013875186443328857, + "learning_rate": 1e-06, + "loss": 0.0434, + "step": 2079 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2941579520702362, + "epoch": 3.3333333333333335, + "grad_norm": 0.03364482894539833, + "learning_rate": 1e-06, + "loss": 0.0486, + "step": 2080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2217.0, + "completions/mean_length": 1733.458984375, + "completions/mean_terminated_length": 1471.322021484375, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "entropy": 0.29007087647914886, + "epoch": 3.3349358974358974, + "frac_reward_zero_std": 0.34375, + "grad_norm": 648.8250732421875, + "learning_rate": 1e-06, + "loss": 0.0486, + "num_tokens": 1249261396.0, + "reward": 0.32781529426574707, + "reward_std": 0.039317332208156586, + "rewards/progression_diversity/mean": -0.006068596616387367, + "rewards/progression_diversity/std": 0.0454968586564064, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.7361490726470947, + "rewards/symbolic_reward_partial_score/std": 0.19703876972198486, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049651026725769, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 12.473758697509766, + "step": 2081 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.29181039333343506, + "epoch": 3.3365384615384617, + "grad_norm": 1459.4117431640625, + "learning_rate": 1e-06, + "loss": 0.1145, + "step": 2082 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.301235556602478, + "epoch": 3.3381410256410255, + "grad_norm": 0.025303874164819717, + "learning_rate": 1e-06, + "loss": -0.003, + "step": 2083 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.29415562748908997, + "epoch": 3.33974358974359, + "grad_norm": 0.023084204643964767, + "learning_rate": 1e-06, + "loss": -0.0009, + "step": 2084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2563.0, + "completions/mean_length": 1614.404296875, + "completions/mean_terminated_length": 1439.270751953125, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "entropy": 0.29135817289352417, + "epoch": 3.3413461538461537, + "frac_reward_zero_std": 0.59375, + "grad_norm": 2127.327392578125, + "learning_rate": 1e-06, + "loss": 0.0704, + "num_tokens": 1250959283.0, + "reward": 0.36038675904273987, + "reward_std": 0.021374184638261795, + "rewards/progression_diversity/mean": -0.0038059065118432045, + "rewards/progression_diversity/std": 0.03501441702246666, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.7085774540901184, + "rewards/symbolic_reward_partial_score/std": 0.2277929186820984, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0553886890411377, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 9.732671737670898, + "step": 2085 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.30615630745887756, + "epoch": 3.342948717948718, + "grad_norm": 0.018460825085639954, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 2086 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30728329718112946, + "epoch": 3.344551282051282, + "grad_norm": 0.009494633413851261, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 2087 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.31128981709480286, + "epoch": 3.3461538461538463, + "grad_norm": 0.007844570092856884, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 2088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2454.0, + "completions/mean_length": 1687.4375, + "completions/mean_terminated_length": 1454.1588134765625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "entropy": 0.309854656457901, + "epoch": 3.34775641025641, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.018283460289239883, + "learning_rate": 1e-06, + "loss": 0.0238, + "num_tokens": 1252780163.0, + "reward": 0.3683873414993286, + "reward_std": 0.03664931654930115, + "rewards/progression_diversity/mean": -0.0050177741795778275, + "rewards/progression_diversity/std": 0.03996478021144867, + "rewards/symbolic_reward_accuracy/mean": 0.25, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.7300781011581421, + "rewards/symbolic_reward_partial_score/std": 0.2158641666173935, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0487573146820068, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 13.234875679016113, + "step": 2089 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2977482080459595, + "epoch": 3.3493589743589745, + "grad_norm": 3524.53466796875, + "learning_rate": 1e-06, + "loss": 0.2433, + "step": 2090 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.30203960835933685, + "epoch": 3.3509615384615383, + "grad_norm": 0.010414248332381248, + "learning_rate": 1e-06, + "loss": 0.0241, + "step": 2091 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30492982268333435, + "epoch": 3.3525641025641026, + "grad_norm": 0.02597237005829811, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2489.0, + "completions/mean_length": 1726.21484375, + "completions/mean_terminated_length": 1463.9482421875, + "completions/min_length": 914.0, + "completions/min_terminated_length": 914.0, + "entropy": 0.30012136697769165, + "epoch": 3.3541666666666665, + "frac_reward_zero_std": 0.4375, + "grad_norm": 42.711727142333984, + "learning_rate": 1e-06, + "loss": 0.0384, + "num_tokens": 1254551649.0, + "reward": 0.33932000398635864, + "reward_std": 0.04096568375825882, + "rewards/progression_diversity/mean": -0.005503435619175434, + "rewards/progression_diversity/std": 0.041275136172771454, + "rewards/symbolic_reward_accuracy/mean": 0.203125, + "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, + "rewards/symbolic_reward_partial_score/mean": 0.7276041507720947, + "rewards/symbolic_reward_partial_score/std": 0.1983337253332138, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0483951568603516, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 13.683696746826172, + "step": 2093 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3052857369184494, + "epoch": 3.355769230769231, + "grad_norm": 0.014036121778190136, + "learning_rate": 1e-06, + "loss": 0.0409, + "step": 2094 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.304688423871994, + "epoch": 3.3573717948717947, + "grad_norm": 0.19088442623615265, + "learning_rate": 1e-06, + "loss": 0.0153, + "step": 2095 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3058953136205673, + "epoch": 3.358974358974359, + "grad_norm": 0.011440307833254337, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 2096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2805.0, + "completions/mean_length": 1805.677734375, + "completions/mean_terminated_length": 1515.27294921875, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "entropy": 0.3070167005062103, + "epoch": 3.360576923076923, + "frac_reward_zero_std": 0.65625, + "grad_norm": 1267.1373291015625, + "learning_rate": 1e-06, + "loss": 0.0541, + "num_tokens": 1256251692.0, + "reward": 0.40323367714881897, + "reward_std": 0.047273900359869, + "rewards/progression_diversity/mean": -0.0057373507879674435, + "rewards/progression_diversity/std": 0.041116341948509216, + "rewards/symbolic_reward_accuracy/mean": 0.283203125, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.7844075560569763, + "rewards/symbolic_reward_partial_score/std": 0.19797967374324799, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0512890815734863, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 12.443960189819336, + "step": 2097 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3034556806087494, + "epoch": 3.3621794871794872, + "grad_norm": 0.012835121713578701, + "learning_rate": 1e-06, + "loss": 0.0352, + "step": 2098 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.3106335401535034, + "epoch": 3.363782051282051, + "grad_norm": 0.007598002441227436, + "learning_rate": 1e-06, + "loss": 0.0361, + "step": 2099 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.30908720195293427, + "epoch": 3.3653846153846154, + "grad_norm": 0.006704273633658886, + "learning_rate": 1e-06, + "loss": 0.0375, + "step": 2100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2538.0, + "completions/mean_length": 1834.38671875, + "completions/mean_terminated_length": 1485.196044921875, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "entropy": 0.30240941047668457, + "epoch": 3.3669871794871793, + "frac_reward_zero_std": 0.5, + "grad_norm": 343.7887268066406, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 1258104818.0, + "reward": 0.35596776008605957, + "reward_std": 0.044219110161066055, + "rewards/progression_diversity/mean": -0.008694911375641823, + "rewards/progression_diversity/std": 0.0567040778696537, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.775390625, + "rewards/symbolic_reward_partial_score/std": 0.20388461649417877, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0403454303741455, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 17.884170532226562, + "step": 2101 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.30909933149814606, + "epoch": 3.3685897435897436, + "grad_norm": 0.010506085120141506, + "learning_rate": 1e-06, + "loss": 0.0457, + "step": 2102 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.29592080414295197, + "epoch": 3.3701923076923075, + "grad_norm": 0.15130743384361267, + "learning_rate": 1e-06, + "loss": 0.0234, + "step": 2103 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.29659251868724823, + "epoch": 3.371794871794872, + "grad_norm": 0.009021518751978874, + "learning_rate": 1e-06, + "loss": 0.0793, + "step": 2104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2437.0, + "completions/mean_length": 1713.470703125, + "completions/mean_terminated_length": 1510.1168212890625, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.3099695146083832, + "epoch": 3.373397435897436, + "frac_reward_zero_std": 0.40625, + "grad_norm": 328.6873779296875, + "learning_rate": 1e-06, + "loss": 0.0443, + "num_tokens": 1259801347.0, + "reward": 0.3114762306213379, + "reward_std": 0.029843464493751526, + "rewards/progression_diversity/mean": -0.004722995683550835, + "rewards/progression_diversity/std": 0.04021621122956276, + "rewards/symbolic_reward_accuracy/mean": 0.16796875, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.7057291269302368, + "rewards/symbolic_reward_partial_score/std": 0.2080366462469101, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0551323890686035, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 11.337949752807617, + "step": 2105 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.31999480724334717, + "epoch": 3.375, + "grad_norm": 0.016840625554323196, + "learning_rate": 1e-06, + "loss": 0.0714, + "step": 2106 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.32205140590667725, + "epoch": 3.376602564102564, + "grad_norm": 0.014530826359987259, + "learning_rate": 1e-06, + "loss": -0.0033, + "step": 2107 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3153335601091385, + "epoch": 3.378205128205128, + "grad_norm": 0.013645652681589127, + "learning_rate": 1e-06, + "loss": 0.0475, + "step": 2108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2217.0, + "completions/mean_length": 1905.322265625, + "completions/mean_terminated_length": 1528.122314453125, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "entropy": 0.29995962977409363, + "epoch": 3.3798076923076925, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1299.5802001953125, + "learning_rate": 1e-06, + "loss": 0.0862, + "num_tokens": 1261670184.0, + "reward": 0.3757146894931793, + "reward_std": 0.06831994652748108, + "rewards/progression_diversity/mean": -0.008122788742184639, + "rewards/progression_diversity/std": 0.05085650831460953, + "rewards/symbolic_reward_accuracy/mean": 0.267578125, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.7253092527389526, + "rewards/symbolic_reward_partial_score/std": 0.23041704297065735, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0357104539871216, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 20.614686965942383, + "step": 2109 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30187252163887024, + "epoch": 3.3814102564102564, + "grad_norm": 0.9350156188011169, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 2110 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3068780303001404, + "epoch": 3.3830128205128207, + "grad_norm": 0.010901644825935364, + "learning_rate": 1e-06, + "loss": 0.0526, + "step": 2111 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.29835619032382965, + "epoch": 3.3846153846153846, + "grad_norm": 16258.962890625, + "learning_rate": 1e-06, + "loss": 1.5092, + "step": 2112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2335.0, + "completions/mean_length": 1839.962890625, + "completions/mean_terminated_length": 1550.2410888671875, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "entropy": 0.3106384575366974, + "epoch": 3.386217948717949, + "frac_reward_zero_std": 0.46875, + "grad_norm": 982.4223022460938, + "learning_rate": 1e-06, + "loss": 0.0231, + "num_tokens": 1263444485.0, + "reward": 0.4431688189506531, + "reward_std": 0.059487175196409225, + "rewards/progression_diversity/mean": -0.005386716220527887, + "rewards/progression_diversity/std": 0.03828057646751404, + "rewards/symbolic_reward_accuracy/mean": 0.357421875, + "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, + "rewards/symbolic_reward_partial_score/mean": 0.7671223878860474, + "rewards/symbolic_reward_partial_score/std": 0.2305992692708969, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045996904373169, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 15.037046432495117, + "step": 2113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3080595135688782, + "epoch": 3.3878205128205128, + "grad_norm": 0.011529210954904556, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 2114 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3010105937719345, + "epoch": 3.389423076923077, + "grad_norm": 2453.80712890625, + "learning_rate": 1e-06, + "loss": 0.1075, + "step": 2115 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29755890369415283, + "epoch": 3.391025641025641, + "grad_norm": 0.02169516310095787, + "learning_rate": 1e-06, + "loss": 0.0655, + "step": 2116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2517.0, + "completions/mean_length": 1782.4921875, + "completions/mean_terminated_length": 1550.7222900390625, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 0.3025133013725281, + "epoch": 3.3926282051282053, + "frac_reward_zero_std": 0.40625, + "grad_norm": 774.7922973632812, + "learning_rate": 1e-06, + "loss": 0.0502, + "num_tokens": 1265197169.0, + "reward": 0.38823676109313965, + "reward_std": 0.03497297316789627, + "rewards/progression_diversity/mean": -0.0049418737180531025, + "rewards/progression_diversity/std": 0.038801729679107666, + "rewards/symbolic_reward_accuracy/mean": 0.26953125, + "rewards/symbolic_reward_accuracy/std": 0.44415023922920227, + "rewards/symbolic_reward_partial_score/mean": 0.7578287720680237, + "rewards/symbolic_reward_partial_score/std": 0.20313479006290436, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0485944747924805, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 13.493077278137207, + "step": 2117 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31337393820285797, + "epoch": 3.394230769230769, + "grad_norm": 0.024296533316373825, + "learning_rate": 1e-06, + "loss": -0.0042, + "step": 2118 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.30753542482852936, + "epoch": 3.3958333333333335, + "grad_norm": 0.02327045053243637, + "learning_rate": 1e-06, + "loss": -0.0024, + "step": 2119 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.29232731461524963, + "epoch": 3.3974358974358974, + "grad_norm": 0.015478910878300667, + "learning_rate": 1e-06, + "loss": 0.0821, + "step": 2120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2356.0, + "completions/mean_length": 1758.404296875, + "completions/mean_terminated_length": 1526.2520751953125, + "completions/min_length": 753.0, + "completions/min_terminated_length": 753.0, + "entropy": 0.2938903123140335, + "epoch": 3.3990384615384617, + "frac_reward_zero_std": 0.28125, + "grad_norm": 500.75775146484375, + "learning_rate": 1e-06, + "loss": 0.0747, + "num_tokens": 1267013136.0, + "reward": 0.30194687843322754, + "reward_std": 0.0503971166908741, + "rewards/progression_diversity/mean": -0.005022912751883268, + "rewards/progression_diversity/std": 0.04021916538476944, + "rewards/symbolic_reward_accuracy/mean": 0.14453125, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.7201985716819763, + "rewards/symbolic_reward_partial_score/std": 0.20605868101119995, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.051767349243164, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 11.300193786621094, + "step": 2121 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3104073405265808, + "epoch": 3.4006410256410255, + "grad_norm": 0.026015326380729675, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 2122 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30487897992134094, + "epoch": 3.40224358974359, + "grad_norm": 0.022471783682703972, + "learning_rate": 1e-06, + "loss": 0.0026, + "step": 2123 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.299276664853096, + "epoch": 3.4038461538461537, + "grad_norm": 0.022479599341750145, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 2124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3687.0, + "completions/mean_length": 1766.92578125, + "completions/mean_terminated_length": 1564.3128662109375, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "entropy": 0.30581988394260406, + "epoch": 3.405448717948718, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0528433695435524, + "learning_rate": 1e-06, + "loss": 0.037, + "num_tokens": 1268731466.0, + "reward": 0.3584764301776886, + "reward_std": 0.056666046380996704, + "rewards/progression_diversity/mean": -0.003432408208027482, + "rewards/progression_diversity/std": 0.0318170040845871, + "rewards/symbolic_reward_accuracy/mean": 0.234375, + "rewards/symbolic_reward_accuracy/std": 0.42402184009552, + "rewards/symbolic_reward_partial_score/mean": 0.7308430671691895, + "rewards/symbolic_reward_partial_score/std": 0.21932339668273926, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0599393844604492, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 7.578122138977051, + "step": 2125 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3097575902938843, + "epoch": 3.407051282051282, + "grad_norm": 0.023739317432045937, + "learning_rate": 1e-06, + "loss": 0.0415, + "step": 2126 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31125499308109283, + "epoch": 3.4086538461538463, + "grad_norm": 0.012953310273587704, + "learning_rate": 1e-06, + "loss": 0.0353, + "step": 2127 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.31313419342041016, + "epoch": 3.41025641025641, + "grad_norm": 0.013461051508784294, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 2128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2428.0, + "completions/mean_length": 1800.833984375, + "completions/mean_terminated_length": 1539.9024658203125, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.3056839108467102, + "epoch": 3.4118589743589745, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.035352922976017, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 1270457349.0, + "reward": 0.45632848143577576, + "reward_std": 0.054405439645051956, + "rewards/progression_diversity/mean": -0.005339231342077255, + "rewards/progression_diversity/std": 0.04081280156970024, + "rewards/symbolic_reward_accuracy/mean": 0.359375, + "rewards/symbolic_reward_accuracy/std": 0.48028653860092163, + "rewards/symbolic_reward_partial_score/mean": 0.8057780265808105, + "rewards/symbolic_reward_partial_score/std": 0.20670460164546967, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.054563045501709, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 9.924196243286133, + "step": 2129 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29381316900253296, + "epoch": 3.4134615384615383, + "grad_norm": 11953.2880859375, + "learning_rate": 1e-06, + "loss": 0.3212, + "step": 2130 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2993968427181244, + "epoch": 3.4150641025641026, + "grad_norm": 0.016664206981658936, + "learning_rate": 1e-06, + "loss": 0.0118, + "step": 2131 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.29864759743213654, + "epoch": 3.4166666666666665, + "grad_norm": 1193.7196044921875, + "learning_rate": 1e-06, + "loss": 0.0213, + "step": 2132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2587.0, + "completions/mean_length": 2058.951171875, + "completions/mean_terminated_length": 1506.8701171875, + "completions/min_length": 912.0, + "completions/min_terminated_length": 912.0, + "entropy": 0.2732394188642502, + "epoch": 3.418269230769231, + "frac_reward_zero_std": 0.375, + "grad_norm": 527.1871337890625, + "learning_rate": 1e-06, + "loss": 0.0862, + "num_tokens": 1272399164.0, + "reward": 0.33772745728492737, + "reward_std": 0.049203261733055115, + "rewards/progression_diversity/mean": -0.010945793241262436, + "rewards/progression_diversity/std": 0.0561109222471714, + "rewards/symbolic_reward_accuracy/mean": 0.21484375, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.7062011957168579, + "rewards/symbolic_reward_partial_score/std": 0.2347198873758316, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0237281322479248, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 24.222614288330078, + "step": 2133 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2859821915626526, + "epoch": 3.4198717948717947, + "grad_norm": 0.02516080252826214, + "learning_rate": 1e-06, + "loss": -0.0053, + "step": 2134 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2708042562007904, + "epoch": 3.421474358974359, + "grad_norm": 0.01243208535015583, + "learning_rate": 1e-06, + "loss": 0.077, + "step": 2135 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.26599569618701935, + "epoch": 3.423076923076923, + "grad_norm": 0.008642677217721939, + "learning_rate": 1e-06, + "loss": 0.1548, + "step": 2136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2676.0, + "completions/mean_length": 1775.25, + "completions/mean_terminated_length": 1484.2391357421875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "entropy": 0.2837539613246918, + "epoch": 3.4246794871794872, + "frac_reward_zero_std": 0.4375, + "grad_norm": 216.27127075195312, + "learning_rate": 1e-06, + "loss": 0.0381, + "num_tokens": 1274171948.0, + "reward": 0.28258827328681946, + "reward_std": 0.03927141800522804, + "rewards/progression_diversity/mean": -0.00484531931579113, + "rewards/progression_diversity/std": 0.036202192306518555, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.7051432132720947, + "rewards/symbolic_reward_partial_score/std": 0.19589273631572723, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0403019189834595, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 15.552392959594727, + "step": 2137 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.27445077896118164, + "epoch": 3.426282051282051, + "grad_norm": 0.011864651925861835, + "learning_rate": 1e-06, + "loss": 0.093, + "step": 2138 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2975247800350189, + "epoch": 3.4278846153846154, + "grad_norm": 0.012054034508764744, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 2139 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.2823929637670517, + "epoch": 3.4294871794871793, + "grad_norm": 0.015474159270524979, + "learning_rate": 1e-06, + "loss": 0.0769, + "step": 2140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2296.0, + "completions/mean_length": 1778.083984375, + "completions/mean_terminated_length": 1516.7454833984375, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "entropy": 0.28506043553352356, + "epoch": 3.4310897435897436, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.018360108137130737, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 1275971479.0, + "reward": 0.3357412815093994, + "reward_std": 0.02756696194410324, + "rewards/progression_diversity/mean": -0.004975297022610903, + "rewards/progression_diversity/std": 0.03728037327528, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.7488607168197632, + "rewards/symbolic_reward_partial_score/std": 0.1986795961856842, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0461030006408691, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 12.876171112060547, + "step": 2141 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2805647701025009, + "epoch": 3.4326923076923075, + "grad_norm": 0.014698946848511696, + "learning_rate": 1e-06, + "loss": 0.0504, + "step": 2142 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.286907359957695, + "epoch": 3.434294871794872, + "grad_norm": 0.020853351801633835, + "learning_rate": 1e-06, + "loss": 0.0685, + "step": 2143 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.28968897461891174, + "epoch": 3.435897435897436, + "grad_norm": 0.010319734923541546, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 2144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2501.0, + "completions/mean_length": 1784.212890625, + "completions/mean_terminated_length": 1522.9840087890625, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "entropy": 0.3004123270511627, + "epoch": 3.4375, + "frac_reward_zero_std": 0.375, + "grad_norm": 258.38421630859375, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 1277659044.0, + "reward": 0.4109523296356201, + "reward_std": 0.08106425404548645, + "rewards/progression_diversity/mean": -0.005354299675673246, + "rewards/progression_diversity/std": 0.04184706509113312, + "rewards/symbolic_reward_accuracy/mean": 0.306640625, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.7619466781616211, + "rewards/symbolic_reward_partial_score/std": 0.23045581579208374, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0512118339538574, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 11.705215454101562, + "step": 2145 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.29704779386520386, + "epoch": 3.439102564102564, + "grad_norm": 0.023789308965206146, + "learning_rate": 1e-06, + "loss": 0.0442, + "step": 2146 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.29680609703063965, + "epoch": 3.440705128205128, + "grad_norm": 0.022177523002028465, + "learning_rate": 1e-06, + "loss": 0.038, + "step": 2147 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.29306888580322266, + "epoch": 3.4423076923076925, + "grad_norm": 0.009412667714059353, + "learning_rate": 1e-06, + "loss": 0.058, + "step": 2148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2385.0, + "completions/mean_length": 1976.712890625, + "completions/mean_terminated_length": 1571.688720703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.2784651964902878, + "epoch": 3.4439102564102564, + "frac_reward_zero_std": 0.375, + "grad_norm": 62.723731994628906, + "learning_rate": 1e-06, + "loss": 0.0358, + "num_tokens": 1279531361.0, + "reward": 0.4271858334541321, + "reward_std": 0.03608609363436699, + "rewards/progression_diversity/mean": -0.008470406755805016, + "rewards/progression_diversity/std": 0.05123213678598404, + "rewards/symbolic_reward_accuracy/mean": 0.3359375, + "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, + "rewards/symbolic_reward_partial_score/mean": 0.7549641728401184, + "rewards/symbolic_reward_partial_score/std": 0.22292669117450714, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0334545373916626, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 19.96367835998535, + "step": 2149 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.28350482881069183, + "epoch": 3.4455128205128207, + "grad_norm": 0.010959633626043797, + "learning_rate": 1e-06, + "loss": 0.07, + "step": 2150 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2834584712982178, + "epoch": 3.4471153846153846, + "grad_norm": 0.014038468711078167, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 2151 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.27503225207328796, + "epoch": 3.448717948717949, + "grad_norm": 0.03383004292845726, + "learning_rate": 1e-06, + "loss": 0.0429, + "step": 2152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2406.0, + "completions/mean_length": 2020.60546875, + "completions/mean_terminated_length": 1557.2701416015625, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 0.2737235426902771, + "epoch": 3.4503205128205128, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.028786683455109596, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 1281467799.0, + "reward": 0.34394705295562744, + "reward_std": 0.05187678337097168, + "rewards/progression_diversity/mean": -0.009591124951839447, + "rewards/progression_diversity/std": 0.05538209527730942, + "rewards/symbolic_reward_accuracy/mean": 0.201171875, + "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, + "rewards/symbolic_reward_partial_score/mean": 0.7490234375, + "rewards/symbolic_reward_partial_score/std": 0.20375196635723114, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0315946340560913, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 20.09771728515625, + "step": 2153 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2721341699361801, + "epoch": 3.451923076923077, + "grad_norm": 0.00952248927205801, + "learning_rate": 1e-06, + "loss": 0.1071, + "step": 2154 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.27051886916160583, + "epoch": 3.453525641025641, + "grad_norm": 0.015221396461129189, + "learning_rate": 1e-06, + "loss": 0.0418, + "step": 2155 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.26560334861278534, + "epoch": 3.4551282051282053, + "grad_norm": 0.010249440558254719, + "learning_rate": 1e-06, + "loss": 0.0637, + "step": 2156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2303.0, + "completions/mean_length": 2162.498046875, + "completions/mean_terminated_length": 1584.38818359375, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "entropy": 0.266851544380188, + "epoch": 3.456730769230769, + "frac_reward_zero_std": 0.28125, + "grad_norm": 609.803955078125, + "learning_rate": 1e-06, + "loss": 0.072, + "num_tokens": 1283476326.0, + "reward": 0.39281851053237915, + "reward_std": 0.035197146236896515, + "rewards/progression_diversity/mean": -0.013561811298131943, + "rewards/progression_diversity/std": 0.06690501421689987, + "rewards/symbolic_reward_accuracy/mean": 0.275390625, + "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, + "rewards/symbolic_reward_partial_score/mean": 0.7610189318656921, + "rewards/symbolic_reward_partial_score/std": 0.19838300347328186, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0253196954727173, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 22.992713928222656, + "step": 2157 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.269494891166687, + "epoch": 3.4583333333333335, + "grad_norm": 0.024904318153858185, + "learning_rate": 1e-06, + "loss": 0.0599, + "step": 2158 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.25990206003189087, + "epoch": 3.4599358974358974, + "grad_norm": 0.01417806651443243, + "learning_rate": 1e-06, + "loss": 0.0332, + "step": 2159 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2642335593700409, + "epoch": 3.4615384615384617, + "grad_norm": 0.00956464372575283, + "learning_rate": 1e-06, + "loss": 0.0442, + "step": 2160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2432.0, + "completions/mean_length": 2211.890625, + "completions/mean_terminated_length": 1575.591796875, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 0.2689553052186966, + "epoch": 3.4631410256410255, + "frac_reward_zero_std": 0.28125, + "grad_norm": 659.0357666015625, + "learning_rate": 1e-06, + "loss": 0.0529, + "num_tokens": 1285388046.0, + "reward": 0.28991633653640747, + "reward_std": 0.031822673976421356, + "rewards/progression_diversity/mean": -0.012760378420352936, + "rewards/progression_diversity/std": 0.0614657998085022, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.7272297739982605, + "rewards/symbolic_reward_partial_score/std": 0.18316437304019928, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.018622875213623, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 27.381484985351562, + "step": 2161 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.26476864516735077, + "epoch": 3.46474358974359, + "grad_norm": 0.013940541073679924, + "learning_rate": 1e-06, + "loss": 0.1169, + "step": 2162 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2731598913669586, + "epoch": 3.4663461538461537, + "grad_norm": 77.1856918334961, + "learning_rate": 1e-06, + "loss": 0.0693, + "step": 2163 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2830631732940674, + "epoch": 3.467948717948718, + "grad_norm": 0.026339799165725708, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 2164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2373.0, + "completions/mean_length": 2145.0546875, + "completions/mean_terminated_length": 1566.2357177734375, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "entropy": 0.27289319038391113, + "epoch": 3.469551282051282, + "frac_reward_zero_std": 0.34375, + "grad_norm": 247.32151794433594, + "learning_rate": 1e-06, + "loss": 0.0502, + "num_tokens": 1287364458.0, + "reward": 0.4520633816719055, + "reward_std": 0.041082873940467834, + "rewards/progression_diversity/mean": -0.013879441656172276, + "rewards/progression_diversity/std": 0.06810557842254639, + "rewards/symbolic_reward_accuracy/mean": 0.3671875, + "rewards/symbolic_reward_accuracy/std": 0.48250964283943176, + "rewards/symbolic_reward_partial_score/mean": 0.7729655504226685, + "rewards/symbolic_reward_partial_score/std": 0.21098491549491882, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.033386468887329, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 19.480430603027344, + "step": 2165 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2645433694124222, + "epoch": 3.4711538461538463, + "grad_norm": 0.010487830266356468, + "learning_rate": 1e-06, + "loss": 0.081, + "step": 2166 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2667945772409439, + "epoch": 3.47275641025641, + "grad_norm": 0.012948272749781609, + "learning_rate": 1e-06, + "loss": 0.0463, + "step": 2167 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2764432579278946, + "epoch": 3.4743589743589745, + "grad_norm": 0.016345176845788956, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 2168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2373.0, + "completions/mean_length": 2468.5, + "completions/mean_terminated_length": 1540.800048828125, + "completions/min_length": 993.0, + "completions/min_terminated_length": 993.0, + "entropy": 0.2626621425151825, + "epoch": 3.4759615384615383, + "frac_reward_zero_std": 0.21875, + "grad_norm": 77.64374542236328, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 1289573546.0, + "reward": 0.23555369675159454, + "reward_std": 0.02879556640982628, + "rewards/progression_diversity/mean": -0.02080235444009304, + "rewards/progression_diversity/std": 0.08193078637123108, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.6706380844116211, + "rewards/symbolic_reward_partial_score/std": 0.17417360842227936, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0140269994735718, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 29.442203521728516, + "step": 2169 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.25798384845256805, + "epoch": 3.4775641025641026, + "grad_norm": 9.645312309265137, + "learning_rate": 1e-06, + "loss": 0.0937, + "step": 2170 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.25725966691970825, + "epoch": 3.4791666666666665, + "grad_norm": 0.025352254509925842, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 2171 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.26060421764850616, + "epoch": 3.480769230769231, + "grad_norm": 0.012905375100672245, + "learning_rate": 1e-06, + "loss": 0.0643, + "step": 2172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2228.0, + "completions/mean_length": 2100.515625, + "completions/mean_terminated_length": 1550.0364990234375, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "entropy": 0.2711750864982605, + "epoch": 3.4823717948717947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 3850.08984375, + "learning_rate": 1e-06, + "loss": 0.0719, + "num_tokens": 1291521378.0, + "reward": 0.2990831136703491, + "reward_std": 0.02100459486246109, + "rewards/progression_diversity/mean": -0.012100995518267155, + "rewards/progression_diversity/std": 0.06185305863618851, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.7525553107261658, + "rewards/symbolic_reward_partial_score/std": 0.1686214655637741, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.03285813331604, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 19.567764282226562, + "step": 2173 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.26994919776916504, + "epoch": 3.483974358974359, + "grad_norm": 0.02370203286409378, + "learning_rate": 1e-06, + "loss": 0.039, + "step": 2174 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.28110212087631226, + "epoch": 3.485576923076923, + "grad_norm": 0.02028624340891838, + "learning_rate": 1e-06, + "loss": 0.0025, + "step": 2175 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2572477161884308, + "epoch": 3.4871794871794872, + "grad_norm": 0.007402379531413317, + "learning_rate": 1e-06, + "loss": 0.0899, + "step": 2176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2413.0, + "completions/mean_length": 1678.212890625, + "completions/mean_terminated_length": 1533.1854248046875, + "completions/min_length": 979.0, + "completions/min_terminated_length": 979.0, + "entropy": 0.30332140624523163, + "epoch": 3.488782051282051, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.013128817081451416, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 1293154063.0, + "reward": 0.4060882329940796, + "reward_std": 0.014026455581188202, + "rewards/progression_diversity/mean": -0.0029925985727459192, + "rewards/progression_diversity/std": 0.02976119890809059, + "rewards/symbolic_reward_accuracy/mean": 0.310546875, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.7332844734191895, + "rewards/symbolic_reward_partial_score/std": 0.22508755326271057, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0598652362823486, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 6.740238666534424, + "step": 2177 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.2928498536348343, + "epoch": 3.4903846153846154, + "grad_norm": 1097.95703125, + "learning_rate": 1e-06, + "loss": 0.0812, + "step": 2178 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.29989422857761383, + "epoch": 3.4919871794871793, + "grad_norm": 0.009113781154155731, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 2179 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29661744832992554, + "epoch": 3.4935897435897436, + "grad_norm": 0.041792068630456924, + "learning_rate": 1e-06, + "loss": 0.0276, + "step": 2180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2569.0, + "completions/mean_length": 1740.89453125, + "completions/mean_terminated_length": 1567.260986328125, + "completions/min_length": 994.0, + "completions/min_terminated_length": 994.0, + "entropy": 0.28554295003414154, + "epoch": 3.4951923076923075, + "frac_reward_zero_std": 0.46875, + "grad_norm": 380.84259033203125, + "learning_rate": 1e-06, + "loss": 0.0355, + "num_tokens": 1294866057.0, + "reward": 0.30031508207321167, + "reward_std": 0.020207438617944717, + "rewards/progression_diversity/mean": -0.004136052913963795, + "rewards/progression_diversity/std": 0.03724845126271248, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.6932454109191895, + "rewards/symbolic_reward_partial_score/std": 0.20394784212112427, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0567407608032227, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 7.55679988861084, + "step": 2181 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2933173179626465, + "epoch": 3.496794871794872, + "grad_norm": 0.02578102797269821, + "learning_rate": 1e-06, + "loss": 0.0043, + "step": 2182 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2895757555961609, + "epoch": 3.498397435897436, + "grad_norm": 0.016278348863124847, + "learning_rate": 1e-06, + "loss": -0.004, + "step": 2183 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2855415791273117, + "epoch": 3.5, + "grad_norm": 0.013905326835811138, + "learning_rate": 1e-06, + "loss": 0.0135, + "step": 2184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2640.0, + "completions/mean_length": 1967.490234375, + "completions/mean_terminated_length": 1621.4940185546875, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 1021.0, + "entropy": 0.2863430380821228, + "epoch": 3.501602564102564, + "frac_reward_zero_std": 0.4375, + "grad_norm": 103.70210266113281, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 1296736836.0, + "reward": 0.31307274103164673, + "reward_std": 0.031172282993793488, + "rewards/progression_diversity/mean": -0.0066928681917488575, + "rewards/progression_diversity/std": 0.04427814856171608, + "rewards/symbolic_reward_accuracy/mean": 0.162109375, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.7208821177482605, + "rewards/symbolic_reward_partial_score/std": 0.19237685203552246, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0420305728912354, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 15.591737747192383, + "step": 2185 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2772316485643387, + "epoch": 3.503205128205128, + "grad_norm": 268.4742736816406, + "learning_rate": 1e-06, + "loss": 0.0571, + "step": 2186 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2852557599544525, + "epoch": 3.5048076923076925, + "grad_norm": 0.01805300824344158, + "learning_rate": 1e-06, + "loss": 0.0119, + "step": 2187 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.288587287068367, + "epoch": 3.5064102564102564, + "grad_norm": 0.017689945176243782, + "learning_rate": 1e-06, + "loss": 0.0324, + "step": 2188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2846.0, + "completions/mean_length": 1944.544921875, + "completions/mean_terminated_length": 1686.184814453125, + "completions/min_length": 1121.0, + "completions/min_terminated_length": 1121.0, + "entropy": 0.27069567143917084, + "epoch": 3.5080128205128203, + "frac_reward_zero_std": 0.5625, + "grad_norm": 196.1943359375, + "learning_rate": 1e-06, + "loss": 0.0453, + "num_tokens": 1298707867.0, + "reward": 0.40786558389663696, + "reward_std": 0.02084019035100937, + "rewards/progression_diversity/mean": -0.004458786454051733, + "rewards/progression_diversity/std": 0.03238670900464058, + "rewards/symbolic_reward_accuracy/mean": 0.30859375, + "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, + "rewards/symbolic_reward_partial_score/mean": 0.7438150644302368, + "rewards/symbolic_reward_partial_score/std": 0.21741990745067596, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052014708518982, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 9.943605422973633, + "step": 2189 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.27081412076950073, + "epoch": 3.5096153846153846, + "grad_norm": 0.013844393193721771, + "learning_rate": 1e-06, + "loss": 0.3136, + "step": 2190 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2878330200910568, + "epoch": 3.511217948717949, + "grad_norm": 0.013577915728092194, + "learning_rate": 1e-06, + "loss": -0.0015, + "step": 2191 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.27837416529655457, + "epoch": 3.5128205128205128, + "grad_norm": 0.009212753735482693, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3301.0, + "completions/mean_length": 2025.873046875, + "completions/mean_terminated_length": 1681.278076171875, + "completions/min_length": 1087.0, + "completions/min_terminated_length": 1087.0, + "entropy": 0.277935653924942, + "epoch": 3.5144230769230766, + "frac_reward_zero_std": 0.375, + "grad_norm": 408.2825927734375, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 1300680634.0, + "reward": 0.3115173280239105, + "reward_std": 0.014852182939648628, + "rewards/progression_diversity/mean": -0.005984636954963207, + "rewards/progression_diversity/std": 0.039667367935180664, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.7267415523529053, + "rewards/symbolic_reward_partial_score/std": 0.1806911826133728, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0477591753005981, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 12.26329231262207, + "step": 2193 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2869657427072525, + "epoch": 3.516025641025641, + "grad_norm": 0.009542958810925484, + "learning_rate": 1e-06, + "loss": 5.8881, + "step": 2194 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2763095796108246, + "epoch": 3.5176282051282053, + "grad_norm": 68490.9453125, + "learning_rate": 1e-06, + "loss": 14.7655, + "step": 2195 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.28788307309150696, + "epoch": 3.519230769230769, + "grad_norm": 0.022392556071281433, + "learning_rate": 1e-06, + "loss": 3.6531, + "step": 2196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2589.0, + "completions/mean_length": 1910.25390625, + "completions/mean_terminated_length": 1680.511962890625, + "completions/min_length": 1155.0, + "completions/min_terminated_length": 1155.0, + "entropy": 0.2990610897541046, + "epoch": 3.5208333333333335, + "frac_reward_zero_std": 0.53125, + "grad_norm": 134.34463500976562, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 1302494364.0, + "reward": 0.38631004095077515, + "reward_std": 0.027447409927845, + "rewards/progression_diversity/mean": -0.004252912942320108, + "rewards/progression_diversity/std": 0.033583469688892365, + "rewards/symbolic_reward_accuracy/mean": 0.275390625, + "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, + "rewards/symbolic_reward_partial_score/mean": 0.7383626699447632, + "rewards/symbolic_reward_partial_score/std": 0.19246533513069153, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0586574077606201, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 7.78310489654541, + "step": 2197 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30396056175231934, + "epoch": 3.5224358974358974, + "grad_norm": 0.016528572887182236, + "learning_rate": 1e-06, + "loss": -0.0008, + "step": 2198 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.308067262172699, + "epoch": 3.5240384615384617, + "grad_norm": 0.01760723628103733, + "learning_rate": 1e-06, + "loss": -0.0059, + "step": 2199 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2869449555873871, + "epoch": 3.5256410256410255, + "grad_norm": 835913.125, + "learning_rate": 1e-06, + "loss": 89.1969, + "step": 2200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2777.0, + "completions/mean_length": 1783.34375, + "completions/mean_terminated_length": 1668.3779296875, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "entropy": 0.3045912981033325, + "epoch": 3.52724358974359, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.0315437950193882, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 1304211468.0, + "reward": 0.38467174768447876, + "reward_std": 0.01725323125720024, + "rewards/progression_diversity/mean": -0.002066644374281168, + "rewards/progression_diversity/std": 0.023847082629799843, + "rewards/symbolic_reward_accuracy/mean": 0.251953125, + "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, + "rewards/symbolic_reward_partial_score/mean": 0.779052734375, + "rewards/symbolic_reward_partial_score/std": 0.19922928512096405, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0659763813018799, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 4.61309814453125, + "step": 2201 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.29725050926208496, + "epoch": 3.5288461538461537, + "grad_norm": 1866.8873291015625, + "learning_rate": 1e-06, + "loss": 0.1574, + "step": 2202 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2994164824485779, + "epoch": 3.530448717948718, + "grad_norm": 0.00927357655018568, + "learning_rate": 1e-06, + "loss": -0.0, + "step": 2203 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.30384716391563416, + "epoch": 3.532051282051282, + "grad_norm": 0.011953097768127918, + "learning_rate": 1e-06, + "loss": -0.0006, + "step": 2204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3051.0, + "completions/mean_length": 1808.693359375, + "completions/mean_terminated_length": 1635.8636474609375, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.2852201759815216, + "epoch": 3.5336538461538463, + "frac_reward_zero_std": 0.625, + "grad_norm": 128.33778381347656, + "learning_rate": 1e-06, + "loss": 0.0167, + "num_tokens": 1306004719.0, + "reward": 0.3015816807746887, + "reward_std": 0.009979461319744587, + "rewards/progression_diversity/mean": -0.001987605821341276, + "rewards/progression_diversity/std": 0.02013721875846386, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.7566406726837158, + "rewards/symbolic_reward_partial_score/std": 0.17199444770812988, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0611894130706787, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 5.849099636077881, + "step": 2205 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2893652319908142, + "epoch": 3.53525641025641, + "grad_norm": 0.021172404289245605, + "learning_rate": 1e-06, + "loss": 0.0304, + "step": 2206 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.30061855912208557, + "epoch": 3.5368589743589745, + "grad_norm": 0.007474346086382866, + "learning_rate": 1e-06, + "loss": -0.0081, + "step": 2207 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.28687790036201477, + "epoch": 3.5384615384615383, + "grad_norm": 0.008998925797641277, + "learning_rate": 1e-06, + "loss": 0.063, + "step": 2208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2967.0, + "completions/mean_length": 1647.38671875, + "completions/mean_terminated_length": 1589.59619140625, + "completions/min_length": 1078.0, + "completions/min_terminated_length": 1078.0, + "entropy": 0.2984195798635483, + "epoch": 3.5400641025641026, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.022925743833184242, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 1307736341.0, + "reward": 0.3372430205345154, + "reward_std": 0.02868266962468624, + "rewards/progression_diversity/mean": -0.0008010210585780442, + "rewards/progression_diversity/std": 0.012307359836995602, + "rewards/symbolic_reward_accuracy/mean": 0.193359375, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.7381021976470947, + "rewards/symbolic_reward_partial_score/std": 0.18334569036960602, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0685679912567139, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 2.327254056930542, + "step": 2209 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.292475089430809, + "epoch": 3.5416666666666665, + "grad_norm": 0.010455112904310226, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 2210 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.2838650196790695, + "epoch": 3.543269230769231, + "grad_norm": 0.0170154869556427, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 2211 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2912348359823227, + "epoch": 3.5448717948717947, + "grad_norm": 0.014277713373303413, + "learning_rate": 1e-06, + "loss": -0.001, + "step": 2212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2683.0, + "completions/mean_length": 1732.833984375, + "completions/mean_terminated_length": 1588.34521484375, + "completions/min_length": 1037.0, + "completions/min_terminated_length": 1037.0, + "entropy": 0.29485444724559784, + "epoch": 3.546474358974359, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.012255196459591389, + "learning_rate": 1e-06, + "loss": 0.031, + "num_tokens": 1309492480.0, + "reward": 0.40414494276046753, + "reward_std": 0.017114289104938507, + "rewards/progression_diversity/mean": -0.002500717993825674, + "rewards/progression_diversity/std": 0.025870388373732567, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.7931965589523315, + "rewards/symbolic_reward_partial_score/std": 0.17990581691265106, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0640480518341064, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 4.959805011749268, + "step": 2213 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2949848175048828, + "epoch": 3.5480769230769234, + "grad_norm": 0.023037033155560493, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 2214 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2907959371805191, + "epoch": 3.5496794871794872, + "grad_norm": 0.017563870176672935, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 2215 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.29562048614025116, + "epoch": 3.551282051282051, + "grad_norm": 0.008448651060461998, + "learning_rate": 1e-06, + "loss": -0.0048, + "step": 2216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2515.0, + "completions/mean_length": 1673.904296875, + "completions/mean_terminated_length": 1528.8343505859375, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "entropy": 0.2969505935907364, + "epoch": 3.5528846153846154, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.011285161599516869, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 1311276607.0, + "reward": 0.37819382548332214, + "reward_std": 0.02540595829486847, + "rewards/progression_diversity/mean": -0.0028848343063145876, + "rewards/progression_diversity/std": 0.028827061876654625, + "rewards/symbolic_reward_accuracy/mean": 0.2421875, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.7763671875, + "rewards/symbolic_reward_partial_score/std": 0.18114922940731049, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0641391277313232, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 5.106628894805908, + "step": 2217 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29557299613952637, + "epoch": 3.5544871794871797, + "grad_norm": 3402.72705078125, + "learning_rate": 1e-06, + "loss": 0.0937, + "step": 2218 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2919696867465973, + "epoch": 3.5560897435897436, + "grad_norm": 0.012350378558039665, + "learning_rate": 1e-06, + "loss": -0.0099, + "step": 2219 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.28622542321681976, + "epoch": 3.5576923076923075, + "grad_norm": 0.01743849739432335, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 2220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2304.0, + "completions/mean_length": 1677.265625, + "completions/mean_terminated_length": 1532.228759765625, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "entropy": 0.3065932095050812, + "epoch": 3.559294871794872, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.021933453157544136, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 1312952407.0, + "reward": 0.42246508598327637, + "reward_std": 0.04646175354719162, + "rewards/progression_diversity/mean": -0.0020300918258726597, + "rewards/progression_diversity/std": 0.02212829887866974, + "rewards/symbolic_reward_accuracy/mean": 0.314453125, + "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, + "rewards/symbolic_reward_partial_score/mean": 0.780029296875, + "rewards/symbolic_reward_partial_score/std": 0.20408369600772858, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0639897584915161, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 5.478954792022705, + "step": 2221 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3041534572839737, + "epoch": 3.560897435897436, + "grad_norm": 0.019840234890580177, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 2222 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.299197256565094, + "epoch": 3.5625, + "grad_norm": 0.015518907457590103, + "learning_rate": 1e-06, + "loss": 0.012, + "step": 2223 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29240258038043976, + "epoch": 3.564102564102564, + "grad_norm": 0.017431948333978653, + "learning_rate": 1e-06, + "loss": 0.069, + "step": 2224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2300.0, + "completions/mean_length": 1671.21484375, + "completions/mean_terminated_length": 1555.3660888671875, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "entropy": 0.2979786843061447, + "epoch": 3.565705128205128, + "frac_reward_zero_std": 0.6875, + "grad_norm": 362.736572265625, + "learning_rate": 1e-06, + "loss": 0.0251, + "num_tokens": 1314672565.0, + "reward": 0.35204920172691345, + "reward_std": 0.017848532646894455, + "rewards/progression_diversity/mean": -0.002111406996846199, + "rewards/progression_diversity/std": 0.024554969742894173, + "rewards/symbolic_reward_accuracy/mean": 0.21484375, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.7445312738418579, + "rewards/symbolic_reward_partial_score/std": 0.18352799117565155, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0668435096740723, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 4.025793075561523, + "step": 2225 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.29960186779499054, + "epoch": 3.5673076923076925, + "grad_norm": 0.014206199906766415, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 2226 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.29641352593898773, + "epoch": 3.5689102564102564, + "grad_norm": 0.013473715633153915, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 2227 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.29800280928611755, + "epoch": 3.5705128205128203, + "grad_norm": 0.01775544323027134, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 2228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2511.0, + "completions/mean_length": 1592.27734375, + "completions/mean_terminated_length": 1534.2706298828125, + "completions/min_length": 928.0, + "completions/min_terminated_length": 928.0, + "entropy": 0.2977886497974396, + "epoch": 3.5721153846153846, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1049.2088623046875, + "learning_rate": 1e-06, + "loss": 0.0501, + "num_tokens": 1316468131.0, + "reward": 0.3352065682411194, + "reward_std": 0.040253110229969025, + "rewards/progression_diversity/mean": -0.000830255274195224, + "rewards/progression_diversity/std": 0.015343528240919113, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.7475911378860474, + "rewards/symbolic_reward_partial_score/std": 0.184952974319458, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0706126689910889, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 2.3357362747192383, + "step": 2229 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.2995491325855255, + "epoch": 3.573717948717949, + "grad_norm": 0.029800059273838997, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 2230 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.30069583654403687, + "epoch": 3.5753205128205128, + "grad_norm": 0.02240557037293911, + "learning_rate": 1e-06, + "loss": -0.0074, + "step": 2231 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.29591603577136993, + "epoch": 3.5769230769230766, + "grad_norm": 0.020445875823497772, + "learning_rate": 1e-06, + "loss": 0.003, + "step": 2232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2225.0, + "completions/max_terminated_length": 2225.0, + "completions/mean_length": 1476.59765625, + "completions/mean_terminated_length": 1476.59765625, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "entropy": 0.3097764253616333, + "epoch": 3.578525641025641, + "frac_reward_zero_std": 0.90625, + "grad_norm": 0.009613445959985256, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 1318013077.0, + "reward": 0.2948632836341858, + "reward_std": 0.003713551675900817, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.7328775525093079, + "rewards/symbolic_reward_partial_score/std": 0.16866321861743927, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0766642093658447, + "sampling/importance_sampling_ratio/min": 5.8294663176639006e-05, + "sampling/sampling_logp_difference/max": 9.75, + "sampling/sampling_logp_difference/mean": 0.14255455136299133, + "step": 2233 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0234375, + "entropy": 0.31367750465869904, + "epoch": 3.5801282051282053, + "grad_norm": 0.011548890732228756, + "learning_rate": 1e-06, + "loss": -0.0032, + "step": 2234 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.046875, + "entropy": 0.3107854425907135, + "epoch": 3.581730769230769, + "grad_norm": 0.005681135691702366, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 2235 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.313510924577713, + "epoch": 3.5833333333333335, + "grad_norm": 0.0011630782391875982, + "learning_rate": 1e-06, + "loss": 0.0037, + "step": 2236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2139.0, + "completions/max_terminated_length": 2139.0, + "completions/mean_length": 1488.98828125, + "completions/mean_terminated_length": 1488.98828125, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "entropy": 0.3215675801038742, + "epoch": 3.5849358974358974, + "frac_reward_zero_std": 0.84375, + "grad_norm": 0.008507279679179192, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 1319585743.0, + "reward": 0.3924804925918579, + "reward_std": 0.01995375007390976, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.259765625, + "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, + "rewards/symbolic_reward_partial_score/mean": 0.7887369990348816, + "rewards/symbolic_reward_partial_score/std": 0.16583316028118134, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.078096866607666, + "sampling/importance_sampling_ratio/min": 0.0001314402325078845, + "sampling/sampling_logp_difference/max": 8.936958312988281, + "sampling/sampling_logp_difference/mean": 0.14448994398117065, + "step": 2237 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.31759844720363617, + "epoch": 3.5865384615384617, + "grad_norm": 0.005916442256420851, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 2238 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0625, + "entropy": 0.31728270649909973, + "epoch": 3.5881410256410255, + "grad_norm": 0.01380041241645813, + "learning_rate": 1e-06, + "loss": -0.0004, + "step": 2239 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0703125, + "entropy": 0.3198378384113312, + "epoch": 3.58974358974359, + "grad_norm": 0.011921794153749943, + "learning_rate": 1e-06, + "loss": -0.0002, + "step": 2240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2938.0, + "completions/mean_length": 1711.978515625, + "completions/mean_terminated_length": 1508.60400390625, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "entropy": 0.29430273175239563, + "epoch": 3.5913461538461537, + "frac_reward_zero_std": 0.46875, + "grad_norm": 607.2608032226562, + "learning_rate": 1e-06, + "loss": 0.0939, + "num_tokens": 1321497412.0, + "reward": 0.3224715292453766, + "reward_std": 0.018792927265167236, + "rewards/progression_diversity/mean": -0.0033359499648213387, + "rewards/progression_diversity/std": 0.029082374647259712, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.7052246332168579, + "rewards/symbolic_reward_partial_score/std": 0.20231178402900696, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0549734830856323, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 11.582987785339355, + "step": 2241 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3087223768234253, + "epoch": 3.592948717948718, + "grad_norm": 0.012669588439166546, + "learning_rate": 1e-06, + "loss": -0.0031, + "step": 2242 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3084537237882614, + "epoch": 3.594551282051282, + "grad_norm": 0.02636776864528656, + "learning_rate": 1e-06, + "loss": 0.0134, + "step": 2243 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3076443672180176, + "epoch": 3.5961538461538463, + "grad_norm": 94.45758056640625, + "learning_rate": 1e-06, + "loss": 0.0224, + "step": 2244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3366.0, + "completions/mean_length": 1752.544921875, + "completions/mean_terminated_length": 1549.732666015625, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "entropy": 0.31993138790130615, + "epoch": 3.59775641025641, + "frac_reward_zero_std": 0.59375, + "grad_norm": 107.91487121582031, + "learning_rate": 1e-06, + "loss": 0.0304, + "num_tokens": 1323181931.0, + "reward": 0.44030338525772095, + "reward_std": 0.029550693929195404, + "rewards/progression_diversity/mean": -0.00335341296158731, + "rewards/progression_diversity/std": 0.03010004572570324, + "rewards/symbolic_reward_accuracy/mean": 0.3359375, + "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, + "rewards/symbolic_reward_partial_score/mean": 0.7978678345680237, + "rewards/symbolic_reward_partial_score/std": 0.19323518872261047, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059898018836975, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 10.545293807983398, + "step": 2245 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3280244767665863, + "epoch": 3.5993589743589745, + "grad_norm": 0.02449607476592064, + "learning_rate": 1e-06, + "loss": 0.0009, + "step": 2246 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.32079842686653137, + "epoch": 3.6009615384615383, + "grad_norm": 0.018285367637872696, + "learning_rate": 1e-06, + "loss": 0.059, + "step": 2247 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.32340890169143677, + "epoch": 3.6025641025641026, + "grad_norm": 0.00783622357994318, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 2248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2539.0, + "completions/mean_length": 1838.962890625, + "completions/mean_terminated_length": 1578.713623046875, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "entropy": 0.3316734880208969, + "epoch": 3.6041666666666665, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.011173716746270657, + "learning_rate": 1e-06, + "loss": 0.0217, + "num_tokens": 1324944408.0, + "reward": 0.37416577339172363, + "reward_std": 0.039710745215415955, + "rewards/progression_diversity/mean": -0.004324798006564379, + "rewards/progression_diversity/std": 0.033225029706954956, + "rewards/symbolic_reward_accuracy/mean": 0.240234375, + "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, + "rewards/symbolic_reward_partial_score/mean": 0.7694987058639526, + "rewards/symbolic_reward_partial_score/std": 0.2098463922739029, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0569475889205933, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 12.69996452331543, + "step": 2249 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31876716017723083, + "epoch": 3.605769230769231, + "grad_norm": 0.010406982153654099, + "learning_rate": 1e-06, + "loss": 0.0613, + "step": 2250 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.322706013917923, + "epoch": 3.6073717948717947, + "grad_norm": 8.13254451751709, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 2251 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.32810820639133453, + "epoch": 3.608974358974359, + "grad_norm": 0.506855845451355, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2847.0, + "completions/mean_length": 1849.2734375, + "completions/mean_terminated_length": 1589.2086181640625, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "entropy": 0.3225103169679642, + "epoch": 3.6105769230769234, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.011867878027260303, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 1326873332.0, + "reward": 0.24351388216018677, + "reward_std": 0.017110900953412056, + "rewards/progression_diversity/mean": -0.004082350060343742, + "rewards/progression_diversity/std": 0.03328991308808327, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.689453125, + "rewards/symbolic_reward_partial_score/std": 0.1663842499256134, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0547860860824585, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 12.677425384521484, + "step": 2253 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.31891578435897827, + "epoch": 3.6121794871794872, + "grad_norm": 250185.734375, + "learning_rate": 1e-06, + "loss": 26.996, + "step": 2254 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.32373523712158203, + "epoch": 3.613782051282051, + "grad_norm": 46074924.0, + "learning_rate": 1e-06, + "loss": 2780.4724, + "step": 2255 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.32337436079978943, + "epoch": 3.6153846153846154, + "grad_norm": 0.01320252288132906, + "learning_rate": 1e-06, + "loss": 2.3287, + "step": 2256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2528.0, + "completions/mean_length": 1806.265625, + "completions/mean_terminated_length": 1604.1981201171875, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "entropy": 0.3251884877681732, + "epoch": 3.6169871794871797, + "frac_reward_zero_std": 0.6875, + "grad_norm": 670.6405029296875, + "learning_rate": 1e-06, + "loss": 0.0407, + "num_tokens": 1328659420.0, + "reward": 0.4107644557952881, + "reward_std": 0.033720072358846664, + "rewards/progression_diversity/mean": -0.003146503819152713, + "rewards/progression_diversity/std": 0.02718130685389042, + "rewards/symbolic_reward_accuracy/mean": 0.28125, + "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, + "rewards/symbolic_reward_partial_score/mean": 0.8081217408180237, + "rewards/symbolic_reward_partial_score/std": 0.1699947714805603, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.063145637512207, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 9.102170944213867, + "step": 2257 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3352958559989929, + "epoch": 3.6185897435897436, + "grad_norm": 0.013571917079389095, + "learning_rate": 1e-06, + "loss": -0.009, + "step": 2258 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3343977779150009, + "epoch": 3.6201923076923075, + "grad_norm": 0.007707657292485237, + "learning_rate": 1e-06, + "loss": 0.0254, + "step": 2259 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3308611214160919, + "epoch": 3.621794871794872, + "grad_norm": 0.008745530620217323, + "learning_rate": 1e-06, + "loss": 0.0577, + "step": 2260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2865.0, + "completions/mean_length": 1843.185546875, + "completions/mean_terminated_length": 1612.3790283203125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.32520605623722076, + "epoch": 3.623397435897436, + "frac_reward_zero_std": 0.34375, + "grad_norm": 302.1920471191406, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 1330441019.0, + "reward": 0.3576836585998535, + "reward_std": 0.02488100342452526, + "rewards/progression_diversity/mean": -0.00360656064003706, + "rewards/progression_diversity/std": 0.02981680817902088, + "rewards/symbolic_reward_accuracy/mean": 0.216796875, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.7601073980331421, + "rewards/symbolic_reward_partial_score/std": 0.19392609596252441, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0611423254013062, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 10.63503646850586, + "step": 2261 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3336937874555588, + "epoch": 3.625, + "grad_norm": 0.02153988927602768, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 2262 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.33279934525489807, + "epoch": 3.626602564102564, + "grad_norm": 0.03544102981686592, + "learning_rate": 1e-06, + "loss": -0.0076, + "step": 2263 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.32690323889255524, + "epoch": 3.628205128205128, + "grad_norm": 0.018338177353143692, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 2264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3015.0, + "completions/mean_length": 1828.125, + "completions/mean_terminated_length": 1626.3604736328125, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "entropy": 0.32338719069957733, + "epoch": 3.6298076923076925, + "frac_reward_zero_std": 0.46875, + "grad_norm": 316.7826232910156, + "learning_rate": 1e-06, + "loss": 0.0849, + "num_tokens": 1332244379.0, + "reward": 0.2881593108177185, + "reward_std": 0.0218312069773674, + "rewards/progression_diversity/mean": -0.0034066529478877783, + "rewards/progression_diversity/std": 0.03087565302848816, + "rewards/symbolic_reward_accuracy/mean": 0.125, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.7152017951011658, + "rewards/symbolic_reward_partial_score/std": 0.19993343949317932, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0661569833755493, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 8.5504789352417, + "step": 2265 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.33843283355236053, + "epoch": 3.6314102564102564, + "grad_norm": 0.01766320690512657, + "learning_rate": 1e-06, + "loss": 0.0185, + "step": 2266 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.332566499710083, + "epoch": 3.6330128205128203, + "grad_norm": 0.01683247648179531, + "learning_rate": 1e-06, + "loss": 0.3716, + "step": 2267 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.334817498922348, + "epoch": 3.6346153846153846, + "grad_norm": 0.011439353227615356, + "learning_rate": 1e-06, + "loss": 0.0074, + "step": 2268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3406.0, + "completions/mean_length": 1817.625, + "completions/mean_terminated_length": 1644.9012451171875, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "entropy": 0.3378443419933319, + "epoch": 3.636217948717949, + "frac_reward_zero_std": 0.40625, + "grad_norm": 1454.8203125, + "learning_rate": 1e-06, + "loss": 0.0475, + "num_tokens": 1334038011.0, + "reward": 0.3225540518760681, + "reward_std": 0.03539721295237541, + "rewards/progression_diversity/mean": -0.0028966807294636965, + "rewards/progression_diversity/std": 0.027032606303691864, + "rewards/symbolic_reward_accuracy/mean": 0.181640625, + "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, + "rewards/symbolic_reward_partial_score/mean": 0.7139486074447632, + "rewards/symbolic_reward_partial_score/std": 0.20663875341415405, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0668741464614868, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 8.729013442993164, + "step": 2269 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.32664966583251953, + "epoch": 3.6378205128205128, + "grad_norm": 0.023835686966776848, + "learning_rate": 1e-06, + "loss": 0.0819, + "step": 2270 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3433900773525238, + "epoch": 3.6394230769230766, + "grad_norm": 0.015054881572723389, + "learning_rate": 1e-06, + "loss": -0.0076, + "step": 2271 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3428369015455246, + "epoch": 3.641025641025641, + "grad_norm": 0.014643248170614243, + "learning_rate": 1e-06, + "loss": -0.0061, + "step": 2272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2808.0, + "completions/mean_length": 2124.025390625, + "completions/mean_terminated_length": 1664.026123046875, + "completions/min_length": 885.0, + "completions/min_terminated_length": 885.0, + "entropy": 0.32794930040836334, + "epoch": 3.6426282051282053, + "frac_reward_zero_std": 0.4375, + "grad_norm": 337.78216552734375, + "learning_rate": 1e-06, + "loss": 0.064, + "num_tokens": 1335932760.0, + "reward": 0.40950584411621094, + "reward_std": 0.062236420810222626, + "rewards/progression_diversity/mean": -0.006936722435057163, + "rewards/progression_diversity/std": 0.04011997580528259, + "rewards/symbolic_reward_accuracy/mean": 0.302734375, + "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, + "rewards/symbolic_reward_partial_score/mean": 0.7682454586029053, + "rewards/symbolic_reward_partial_score/std": 0.2247115969657898, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0450644493103027, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 19.491121292114258, + "step": 2273 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3140611946582794, + "epoch": 3.644230769230769, + "grad_norm": 11.444162368774414, + "learning_rate": 1e-06, + "loss": 0.0813, + "step": 2274 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3365636169910431, + "epoch": 3.6458333333333335, + "grad_norm": 0.028920264914631844, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 2275 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3288937658071518, + "epoch": 3.6474358974358974, + "grad_norm": 0.01110448595136404, + "learning_rate": 1e-06, + "loss": 0.0625, + "step": 2276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2847.0, + "completions/mean_length": 2268.736328125, + "completions/mean_terminated_length": 1694.945068359375, + "completions/min_length": 956.0, + "completions/min_terminated_length": 956.0, + "entropy": 0.3222566395998001, + "epoch": 3.6490384615384617, + "frac_reward_zero_std": 0.46875, + "grad_norm": 1264.3094482421875, + "learning_rate": 1e-06, + "loss": 0.043, + "num_tokens": 1337987217.0, + "reward": 0.5130249261856079, + "reward_std": 0.08471733331680298, + "rewards/progression_diversity/mean": -0.01001398079097271, + "rewards/progression_diversity/std": 0.0511719211935997, + "rewards/symbolic_reward_accuracy/mean": 0.4453125, + "rewards/symbolic_reward_accuracy/std": 0.49748632311820984, + "rewards/symbolic_reward_partial_score/mean": 0.8282551765441895, + "rewards/symbolic_reward_partial_score/std": 0.22973352670669556, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0400309562683105, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 21.84160614013672, + "step": 2277 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3258931040763855, + "epoch": 3.6506410256410255, + "grad_norm": 0.021348005160689354, + "learning_rate": 1e-06, + "loss": 0.0619, + "step": 2278 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31560730934143066, + "epoch": 3.65224358974359, + "grad_norm": 0.010525341145694256, + "learning_rate": 1e-06, + "loss": 0.0952, + "step": 2279 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3183016777038574, + "epoch": 3.6538461538461537, + "grad_norm": 0.01716870814561844, + "learning_rate": 1e-06, + "loss": 0.0683, + "step": 2280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2697.0, + "completions/mean_length": 1967.486328125, + "completions/mean_terminated_length": 1650.9560546875, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.3132941722869873, + "epoch": 3.655448717948718, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1842.542236328125, + "learning_rate": 1e-06, + "loss": 0.1164, + "num_tokens": 1339971770.0, + "reward": 0.2741461396217346, + "reward_std": 0.0371134914457798, + "rewards/progression_diversity/mean": -0.006771244574338198, + "rewards/progression_diversity/std": 0.048090219497680664, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.6731607913970947, + "rewards/symbolic_reward_partial_score/std": 0.20429718494415283, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0507025718688965, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 15.552701950073242, + "step": 2281 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3280630111694336, + "epoch": 3.657051282051282, + "grad_norm": 0.013712570071220398, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 2282 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3313450366258621, + "epoch": 3.6586538461538463, + "grad_norm": 0.0131281279027462, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 2283 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3225076198577881, + "epoch": 3.66025641025641, + "grad_norm": 0.02563142031431198, + "learning_rate": 1e-06, + "loss": 0.0724, + "step": 2284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3059.0, + "completions/mean_length": 1943.8203125, + "completions/mean_terminated_length": 1656.1673583984375, + "completions/min_length": 718.0, + "completions/min_terminated_length": 718.0, + "entropy": 0.31988005340099335, + "epoch": 3.6618589743589745, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1075.149658203125, + "learning_rate": 1e-06, + "loss": 0.0841, + "num_tokens": 1341844046.0, + "reward": 0.3518640398979187, + "reward_std": 0.06046595424413681, + "rewards/progression_diversity/mean": -0.005980104673653841, + "rewards/progression_diversity/std": 0.04314341023564339, + "rewards/symbolic_reward_accuracy/mean": 0.23046875, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.7140950560569763, + "rewards/symbolic_reward_partial_score/std": 0.21391168236732483, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0547668933868408, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 14.504049301147461, + "step": 2285 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.33179762959480286, + "epoch": 3.6634615384615383, + "grad_norm": 0.028521860018372536, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 2286 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.324516236782074, + "epoch": 3.6650641025641026, + "grad_norm": 0.011499549262225628, + "learning_rate": 1e-06, + "loss": 0.0164, + "step": 2287 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3264227360486984, + "epoch": 3.6666666666666665, + "grad_norm": 1.3879690170288086, + "learning_rate": 1e-06, + "loss": 0.0003, + "step": 2288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2883.0, + "completions/mean_length": 1695.943359375, + "completions/mean_terminated_length": 1609.373291015625, + "completions/min_length": 943.0, + "completions/min_terminated_length": 943.0, + "entropy": 0.336402028799057, + "epoch": 3.668269230769231, + "frac_reward_zero_std": 0.5625, + "grad_norm": 696.1746826171875, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 1343498033.0, + "reward": 0.3187481164932251, + "reward_std": 0.019451720640063286, + "rewards/progression_diversity/mean": -0.0016582165844738483, + "rewards/progression_diversity/std": 0.02263886295258999, + "rewards/symbolic_reward_accuracy/mean": 0.150390625, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.7617676258087158, + "rewards/symbolic_reward_partial_score/std": 0.14879107475280762, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.07216477394104, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 5.351583480834961, + "step": 2289 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.334991991519928, + "epoch": 3.6698717948717947, + "grad_norm": 0.016783706843852997, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 2290 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3304772675037384, + "epoch": 3.671474358974359, + "grad_norm": 0.022302983328700066, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 2291 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3341434746980667, + "epoch": 3.6730769230769234, + "grad_norm": 0.012253678403794765, + "learning_rate": 1e-06, + "loss": 0.0019, + "step": 2292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3012.0, + "completions/mean_length": 1679.7109375, + "completions/mean_terminated_length": 1593.0452880859375, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "entropy": 0.32631611824035645, + "epoch": 3.6746794871794872, + "frac_reward_zero_std": 0.5625, + "grad_norm": 701.1774291992188, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 1345151549.0, + "reward": 0.3470765948295593, + "reward_std": 0.03532809019088745, + "rewards/progression_diversity/mean": -0.0018145160283893347, + "rewards/progression_diversity/std": 0.023674938827753067, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.737060546875, + "rewards/symbolic_reward_partial_score/std": 0.2012006640434265, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0714975595474243, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 5.101583480834961, + "step": 2293 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.33157244324684143, + "epoch": 3.676282051282051, + "grad_norm": 0.024722039699554443, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 2294 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3273026645183563, + "epoch": 3.6778846153846154, + "grad_norm": 0.015624015592038631, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 2295 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.33001944422721863, + "epoch": 3.6794871794871797, + "grad_norm": 0.014608138240873814, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 2296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2430.0, + "completions/mean_length": 1686.3046875, + "completions/mean_terminated_length": 1541.3570556640625, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.33564648032188416, + "epoch": 3.6810897435897436, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.027066614478826523, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 1346826425.0, + "reward": 0.43666860461235046, + "reward_std": 0.019992386922240257, + "rewards/progression_diversity/mean": -0.0030630123801529408, + "rewards/progression_diversity/std": 0.031911518424749374, + "rewards/symbolic_reward_accuracy/mean": 0.341796875, + "rewards/symbolic_reward_accuracy/std": 0.4747757613658905, + "rewards/symbolic_reward_partial_score/mean": 0.7746745347976685, + "rewards/symbolic_reward_partial_score/std": 0.22795653343200684, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0644441843032837, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 8.418769836425781, + "step": 2297 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3229387104511261, + "epoch": 3.6826923076923075, + "grad_norm": 9548.986328125, + "learning_rate": 1e-06, + "loss": 0.9497, + "step": 2298 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31784258782863617, + "epoch": 3.684294871794872, + "grad_norm": 0.017409533262252808, + "learning_rate": 1e-06, + "loss": 0.0463, + "step": 2299 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.32778775691986084, + "epoch": 3.685897435897436, + "grad_norm": 0.01562940888106823, + "learning_rate": 1e-06, + "loss": -0.0083, + "step": 2300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2580.0, + "completions/mean_length": 1658.37109375, + "completions/mean_terminated_length": 1542.4212646484375, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "entropy": 0.3309648334980011, + "epoch": 3.6875, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.03380941227078438, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 1348522775.0, + "reward": 0.25678491592407227, + "reward_std": 0.02014780044555664, + "rewards/progression_diversity/mean": -0.001687450218014419, + "rewards/progression_diversity/std": 0.02322215586900711, + "rewards/symbolic_reward_accuracy/mean": 0.087890625, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.6808756589889526, + "rewards/symbolic_reward_partial_score/std": 0.177845299243927, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0702204704284668, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 5.331143379211426, + "step": 2301 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.32819345593452454, + "epoch": 3.689102564102564, + "grad_norm": 297679.28125, + "learning_rate": 1e-06, + "loss": 8.1288, + "step": 2302 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3345276117324829, + "epoch": 3.690705128205128, + "grad_norm": 0.007232366129755974, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 2303 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.32529085874557495, + "epoch": 3.6923076923076925, + "grad_norm": 31849.216796875, + "learning_rate": 1e-06, + "loss": 4.3377, + "step": 2304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2630.0, + "completions/mean_length": 1738.244140625, + "completions/mean_terminated_length": 1564.5791015625, + "completions/min_length": 800.0, + "completions/min_terminated_length": 800.0, + "entropy": 0.3245372474193573, + "epoch": 3.6939102564102564, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.024981984868645668, + "learning_rate": 1e-06, + "loss": 0.0394, + "num_tokens": 1350288916.0, + "reward": 0.2519104480743408, + "reward_std": 0.015460798516869545, + "rewards/progression_diversity/mean": -0.002804999705404043, + "rewards/progression_diversity/std": 0.027330690994858742, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.7173991203308105, + "rewards/symbolic_reward_partial_score/std": 0.16349852085113525, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06309175491333, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 9.228836059570312, + "step": 2305 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.32626084983348846, + "epoch": 3.6955128205128203, + "grad_norm": 1247.531494140625, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 2306 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3255147635936737, + "epoch": 3.6971153846153846, + "grad_norm": 0.009068300016224384, + "learning_rate": 1e-06, + "loss": 0.5594, + "step": 2307 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.32918526232242584, + "epoch": 3.698717948717949, + "grad_norm": 0.017367210239171982, + "learning_rate": 1e-06, + "loss": -0.0038, + "step": 2308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3004.0, + "completions/mean_length": 1774.068359375, + "completions/mean_terminated_length": 1600.828125, + "completions/min_length": 883.0, + "completions/min_terminated_length": 883.0, + "entropy": 0.3189825713634491, + "epoch": 3.7003205128205128, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.019421013072133064, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 1352175175.0, + "reward": 0.33472028374671936, + "reward_std": 0.026918543502688408, + "rewards/progression_diversity/mean": -0.0030723384115844965, + "rewards/progression_diversity/std": 0.029028048738837242, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.6978678703308105, + "rewards/symbolic_reward_partial_score/std": 0.19654740393161774, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0647794008255005, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 8.113250732421875, + "step": 2309 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.32003454864025116, + "epoch": 3.7019230769230766, + "grad_norm": 0.031067878007888794, + "learning_rate": 1e-06, + "loss": 0.4607, + "step": 2310 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.32604101300239563, + "epoch": 3.703525641025641, + "grad_norm": 7065.14501953125, + "learning_rate": 1e-06, + "loss": 0.5045, + "step": 2311 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.32465484738349915, + "epoch": 3.7051282051282053, + "grad_norm": 0.008808803744614124, + "learning_rate": 1e-06, + "loss": 0.004, + "step": 2312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2823.0, + "completions/mean_length": 1703.431640625, + "completions/mean_terminated_length": 1645.86083984375, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "entropy": 0.33033812046051025, + "epoch": 3.706730769230769, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.029636235907673836, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 1353921444.0, + "reward": 0.3808506727218628, + "reward_std": 0.02052554301917553, + "rewards/progression_diversity/mean": -0.0008730281260795891, + "rewards/progression_diversity/std": 0.01423015259206295, + "rewards/symbolic_reward_accuracy/mean": 0.263671875, + "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, + "rewards/symbolic_reward_partial_score/mean": 0.7421875, + "rewards/symbolic_reward_partial_score/std": 0.20971913635730743, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0755610466003418, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 2.919158458709717, + "step": 2313 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3263317346572876, + "epoch": 3.7083333333333335, + "grad_norm": 0.009872586466372013, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 2314 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.3279000520706177, + "epoch": 3.7099358974358974, + "grad_norm": 0.025471488013863564, + "learning_rate": 1e-06, + "loss": 0.0166, + "step": 2315 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.3354235142469406, + "epoch": 3.7115384615384617, + "grad_norm": 0.012011319398880005, + "learning_rate": 1e-06, + "loss": 0.0027, + "step": 2316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3046.0, + "completions/mean_length": 2033.783203125, + "completions/mean_terminated_length": 1747.92236328125, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "entropy": 0.31582309305667877, + "epoch": 3.7131410256410255, + "frac_reward_zero_std": 0.46875, + "grad_norm": 145.75550842285156, + "learning_rate": 1e-06, + "loss": 0.0422, + "num_tokens": 1355863077.0, + "reward": 0.37086308002471924, + "reward_std": 0.03677962347865105, + "rewards/progression_diversity/mean": -0.004513155668973923, + "rewards/progression_diversity/std": 0.03423220291733742, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.7519856691360474, + "rewards/symbolic_reward_partial_score/std": 0.19709475338459015, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0543160438537598, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 12.873621940612793, + "step": 2317 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3110759109258652, + "epoch": 3.71474358974359, + "grad_norm": 2783.682861328125, + "learning_rate": 1e-06, + "loss": 1.9094, + "step": 2318 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31409376859664917, + "epoch": 3.7163461538461537, + "grad_norm": 5945.8662109375, + "learning_rate": 1e-06, + "loss": 0.319, + "step": 2319 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.32960157096385956, + "epoch": 3.717948717948718, + "grad_norm": 0.012837960384786129, + "learning_rate": 1e-06, + "loss": -0.0077, + "step": 2320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3660.0, + "completions/mean_length": 2013.263671875, + "completions/mean_terminated_length": 1814.0654296875, + "completions/min_length": 973.0, + "completions/min_terminated_length": 973.0, + "entropy": 0.32310742139816284, + "epoch": 3.719551282051282, + "frac_reward_zero_std": 0.5625, + "grad_norm": 137.2488250732422, + "learning_rate": 1e-06, + "loss": 0.0302, + "num_tokens": 1357745916.0, + "reward": 0.28083521127700806, + "reward_std": 0.030796613544225693, + "rewards/progression_diversity/mean": -0.002905955072492361, + "rewards/progression_diversity/std": 0.02647358365356922, + "rewards/symbolic_reward_accuracy/mean": 0.111328125, + "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, + "rewards/symbolic_reward_partial_score/mean": 0.7168131470680237, + "rewards/symbolic_reward_partial_score/std": 0.1985962688922882, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0641801357269287, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 7.660578727722168, + "step": 2321 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.31349869072437286, + "epoch": 3.7211538461538463, + "grad_norm": 15555.6708984375, + "learning_rate": 1e-06, + "loss": 6.2007, + "step": 2322 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.3223974406719208, + "epoch": 3.72275641025641, + "grad_norm": 968024.4375, + "learning_rate": 1e-06, + "loss": 23.6673, + "step": 2323 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.31636857986450195, + "epoch": 3.7243589743589745, + "grad_norm": 0.015157344751060009, + "learning_rate": 1e-06, + "loss": 0.0284, + "step": 2324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3458.0, + "completions/mean_length": 2236.482421875, + "completions/mean_terminated_length": 1838.760986328125, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "entropy": 0.3094794303178787, + "epoch": 3.7259615384615383, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.025273853912949562, + "learning_rate": 1e-06, + "loss": 0.0773, + "num_tokens": 1359728803.0, + "reward": 0.36245208978652954, + "reward_std": 0.04594516381621361, + "rewards/progression_diversity/mean": -0.005767707247287035, + "rewards/progression_diversity/std": 0.036357343196868896, + "rewards/symbolic_reward_accuracy/mean": 0.2421875, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.7291991710662842, + "rewards/symbolic_reward_partial_score/std": 0.22089123725891113, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.050485372543335, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 14.873160362243652, + "step": 2325 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.31531573832035065, + "epoch": 3.7275641025641026, + "grad_norm": 0.019093314185738564, + "learning_rate": 1e-06, + "loss": 0.0435, + "step": 2326 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3113931119441986, + "epoch": 3.7291666666666665, + "grad_norm": 0.015845902264118195, + "learning_rate": 1e-06, + "loss": 0.0502, + "step": 2327 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3193650245666504, + "epoch": 3.730769230769231, + "grad_norm": 0.1452263742685318, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 2328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3291.0, + "completions/mean_length": 2277.3984375, + "completions/mean_terminated_length": 1880.8272705078125, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "entropy": 0.32257337868213654, + "epoch": 3.7323717948717947, + "frac_reward_zero_std": 0.375, + "grad_norm": 67.19734954833984, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 1361753231.0, + "reward": 0.4234715700149536, + "reward_std": 0.052917372435331345, + "rewards/progression_diversity/mean": -0.00587063655257225, + "rewards/progression_diversity/std": 0.03647465631365776, + "rewards/symbolic_reward_accuracy/mean": 0.328125, + "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, + "rewards/symbolic_reward_partial_score/mean": 0.7568196654319763, + "rewards/symbolic_reward_partial_score/std": 0.2349434494972229, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.053420066833496, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 13.831912994384766, + "step": 2329 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3165089935064316, + "epoch": 3.733974358974359, + "grad_norm": 1270778.375, + "learning_rate": 1e-06, + "loss": 109.9686, + "step": 2330 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31606537103652954, + "epoch": 3.7355769230769234, + "grad_norm": 241028.328125, + "learning_rate": 1e-06, + "loss": 29.3297, + "step": 2331 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.31982485949993134, + "epoch": 3.7371794871794872, + "grad_norm": 5490.81640625, + "learning_rate": 1e-06, + "loss": 1.0961, + "step": 2332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3391.0, + "completions/mean_length": 2758.09375, + "completions/mean_terminated_length": 1999.5382080078125, + "completions/min_length": 779.0, + "completions/min_terminated_length": 779.0, + "entropy": 0.3020293414592743, + "epoch": 3.738782051282051, + "frac_reward_zero_std": 0.28125, + "grad_norm": 2158.609375, + "learning_rate": 1e-06, + "loss": 0.0567, + "num_tokens": 1364147151.0, + "reward": 0.28579181432724, + "reward_std": 0.057038046419620514, + "rewards/progression_diversity/mean": -0.01163883414119482, + "rewards/progression_diversity/std": 0.052136827260255814, + "rewards/symbolic_reward_accuracy/mean": 0.140625, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.6769856810569763, + "rewards/symbolic_reward_partial_score/std": 0.21585947275161743, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0365989208221436, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 21.644445419311523, + "step": 2333 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.29430215060710907, + "epoch": 3.7403846153846154, + "grad_norm": 2930266.0, + "learning_rate": 1e-06, + "loss": 34.3929, + "step": 2334 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3125629723072052, + "epoch": 3.7419871794871797, + "grad_norm": 533435.4375, + "learning_rate": 1e-06, + "loss": 14.3577, + "step": 2335 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30381515622138977, + "epoch": 3.7435897435897436, + "grad_norm": 0.9284766912460327, + "learning_rate": 1e-06, + "loss": 0.0779, + "step": 2336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3698.0, + "completions/mean_length": 2933.576171875, + "completions/mean_terminated_length": 1976.8514404296875, + "completions/min_length": 802.0, + "completions/min_terminated_length": 802.0, + "entropy": 0.28462617099285126, + "epoch": 3.7451923076923075, + "frac_reward_zero_std": 0.15625, + "grad_norm": 4198.36865234375, + "learning_rate": 1e-06, + "loss": 0.1291, + "num_tokens": 1366628678.0, + "reward": 0.24286873638629913, + "reward_std": 0.05714884027838707, + "rewards/progression_diversity/mean": -0.014884944073855877, + "rewards/progression_diversity/std": 0.05795244127511978, + "rewards/symbolic_reward_accuracy/mean": 0.080078125, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.6570637822151184, + "rewards/symbolic_reward_partial_score/std": 0.2119654417037964, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0245647430419922, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 27.924781799316406, + "step": 2337 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3178318291902542, + "epoch": 3.746794871794872, + "grad_norm": 0.02091590128839016, + "learning_rate": 1e-06, + "loss": 0.039, + "step": 2338 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.29258735477924347, + "epoch": 3.748397435897436, + "grad_norm": 0.03195264935493469, + "learning_rate": 1e-06, + "loss": 0.0798, + "step": 2339 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3096400648355484, + "epoch": 3.75, + "grad_norm": 0.027371717616915703, + "learning_rate": 1e-06, + "loss": 0.0889, + "step": 2340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4023.0, + "completions/mean_length": 2866.087890625, + "completions/mean_terminated_length": 1994.87109375, + "completions/min_length": 936.0, + "completions/min_terminated_length": 936.0, + "entropy": 0.3162682503461838, + "epoch": 3.751602564102564, + "frac_reward_zero_std": 0.125, + "grad_norm": 2558.345458984375, + "learning_rate": 1e-06, + "loss": 0.0613, + "num_tokens": 1368975267.0, + "reward": 0.2645840644836426, + "reward_std": 0.05693788453936577, + "rewards/progression_diversity/mean": -0.013275885954499245, + "rewards/progression_diversity/std": 0.05504615232348442, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.6545247435569763, + "rewards/symbolic_reward_partial_score/std": 0.22332006692886353, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.033745288848877, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 23.267847061157227, + "step": 2341 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3221198171377182, + "epoch": 3.753205128205128, + "grad_norm": 0.016970986500382423, + "learning_rate": 1e-06, + "loss": 0.0128, + "step": 2342 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2914904057979584, + "epoch": 3.7548076923076925, + "grad_norm": 0.016527332365512848, + "learning_rate": 1e-06, + "loss": 0.1016, + "step": 2343 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.29611217975616455, + "epoch": 3.7564102564102564, + "grad_norm": 0.015034251846373081, + "learning_rate": 1e-06, + "loss": 0.1444, + "step": 2344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3699.0, + "completions/mean_length": 2942.474609375, + "completions/mean_terminated_length": 1956.1990966796875, + "completions/min_length": 891.0, + "completions/min_terminated_length": 891.0, + "entropy": 0.324889600276947, + "epoch": 3.7580128205128203, + "frac_reward_zero_std": 0.21875, + "grad_norm": 564.7570190429688, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 1371389206.0, + "reward": 0.4041377902030945, + "reward_std": 0.0859639048576355, + "rewards/progression_diversity/mean": -0.01590690203011036, + "rewards/progression_diversity/std": 0.06111828610301018, + "rewards/symbolic_reward_accuracy/mean": 0.314453125, + "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, + "rewards/symbolic_reward_partial_score/mean": 0.7259114384651184, + "rewards/symbolic_reward_partial_score/std": 0.2665213644504547, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0233384370803833, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 29.020061492919922, + "step": 2345 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2806452810764313, + "epoch": 3.7596153846153846, + "grad_norm": 0.017962336540222168, + "learning_rate": 1e-06, + "loss": 0.1727, + "step": 2346 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3103862851858139, + "epoch": 3.761217948717949, + "grad_norm": 0.013579954393208027, + "learning_rate": 1e-06, + "loss": 0.0811, + "step": 2347 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.305005207657814, + "epoch": 3.7628205128205128, + "grad_norm": 0.01808071881532669, + "learning_rate": 1e-06, + "loss": 0.1148, + "step": 2348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3296.0, + "completions/mean_length": 2663.65234375, + "completions/mean_terminated_length": 1959.322509765625, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "entropy": 0.33493170142173767, + "epoch": 3.7644230769230766, + "frac_reward_zero_std": 0.28125, + "grad_norm": 88.44670867919922, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 1373608292.0, + "reward": 0.3504989743232727, + "reward_std": 0.0687357485294342, + "rewards/progression_diversity/mean": -0.013580359518527985, + "rewards/progression_diversity/std": 0.062451787292957306, + "rewards/symbolic_reward_accuracy/mean": 0.22265625, + "rewards/symbolic_reward_accuracy/std": 0.41643625497817993, + "rewards/symbolic_reward_partial_score/mean": 0.7293294668197632, + "rewards/symbolic_reward_partial_score/std": 0.23006510734558105, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0389269590377808, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 22.056781768798828, + "step": 2349 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3003799021244049, + "epoch": 3.766025641025641, + "grad_norm": 0.014394021593034267, + "learning_rate": 1e-06, + "loss": 0.1444, + "step": 2350 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3186776787042618, + "epoch": 3.7676282051282053, + "grad_norm": 0.02413056790828705, + "learning_rate": 1e-06, + "loss": 0.0553, + "step": 2351 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.32591530680656433, + "epoch": 3.769230769230769, + "grad_norm": 0.01364830881357193, + "learning_rate": 1e-06, + "loss": 0.0332, + "step": 2352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3292.0, + "completions/mean_length": 3137.02734375, + "completions/mean_terminated_length": 1953.2552490234375, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "entropy": 0.30336521565914154, + "epoch": 3.7708333333333335, + "frac_reward_zero_std": 0.21875, + "grad_norm": 689.3433227539062, + "learning_rate": 1e-06, + "loss": 0.0575, + "num_tokens": 1376180162.0, + "reward": 0.30668240785598755, + "reward_std": 0.07010701298713684, + "rewards/progression_diversity/mean": -0.021703477948904037, + "rewards/progression_diversity/std": 0.07450754940509796, + "rewards/symbolic_reward_accuracy/mean": 0.171875, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.6844563484191895, + "rewards/symbolic_reward_partial_score/std": 0.23659244179725647, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0194364786148071, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 32.71318054199219, + "step": 2353 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3008395582437515, + "epoch": 3.7724358974358974, + "grad_norm": 0.02489347755908966, + "learning_rate": 1e-06, + "loss": 0.0919, + "step": 2354 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3014819622039795, + "epoch": 3.7740384615384617, + "grad_norm": 0.018208302557468414, + "learning_rate": 1e-06, + "loss": 0.1138, + "step": 2355 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30451202392578125, + "epoch": 3.7756410256410255, + "grad_norm": 0.03824426978826523, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 2356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3241.0, + "completions/mean_length": 2671.005859375, + "completions/mean_terminated_length": 1937.3887939453125, + "completions/min_length": 908.0, + "completions/min_terminated_length": 908.0, + "entropy": 0.32287219166755676, + "epoch": 3.77724358974359, + "frac_reward_zero_std": 0.34375, + "grad_norm": 901.6248779296875, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 1378516597.0, + "reward": 0.2821524143218994, + "reward_std": 0.06520838290452957, + "rewards/progression_diversity/mean": -0.015226154588162899, + "rewards/progression_diversity/std": 0.06770697236061096, + "rewards/symbolic_reward_accuracy/mean": 0.126953125, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.6903645992279053, + "rewards/symbolic_reward_partial_score/std": 0.22543098032474518, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0395439863204956, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 22.674461364746094, + "step": 2357 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30759990215301514, + "epoch": 3.7788461538461537, + "grad_norm": 0.5585769414901733, + "learning_rate": 1e-06, + "loss": 0.0984, + "step": 2358 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3166644871234894, + "epoch": 3.780448717948718, + "grad_norm": 0.1928190290927887, + "learning_rate": 1e-06, + "loss": 0.0079, + "step": 2359 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3207753002643585, + "epoch": 3.782051282051282, + "grad_norm": 0.02237272635102272, + "learning_rate": 1e-06, + "loss": 0.0525, + "step": 2360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3211.0, + "completions/mean_length": 2510.060546875, + "completions/mean_terminated_length": 1887.14892578125, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "entropy": 0.3191475421190262, + "epoch": 3.7836538461538463, + "frac_reward_zero_std": 0.21875, + "grad_norm": 950.1478271484375, + "learning_rate": 1e-06, + "loss": 0.0732, + "num_tokens": 1380652708.0, + "reward": 0.36586660146713257, + "reward_std": 0.09463842213153839, + "rewards/progression_diversity/mean": -0.011485631577670574, + "rewards/progression_diversity/std": 0.05633028969168663, + "rewards/symbolic_reward_accuracy/mean": 0.2421875, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.7420735359191895, + "rewards/symbolic_reward_partial_score/std": 0.243971049785614, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.038116455078125, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 23.570289611816406, + "step": 2361 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.33112041652202606, + "epoch": 3.78525641025641, + "grad_norm": 14.519754409790039, + "learning_rate": 1e-06, + "loss": 0.061, + "step": 2362 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.33028391003608704, + "epoch": 3.7868589743589745, + "grad_norm": 0.024247992783784866, + "learning_rate": 1e-06, + "loss": 0.0352, + "step": 2363 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3137480616569519, + "epoch": 3.7884615384615383, + "grad_norm": 0.031128326430916786, + "learning_rate": 1e-06, + "loss": 0.0846, + "step": 2364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2966.0, + "completions/mean_length": 2746.02734375, + "completions/mean_terminated_length": 1867.07275390625, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "entropy": 0.30356088280677795, + "epoch": 3.7900641025641026, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1132.28857421875, + "learning_rate": 1e-06, + "loss": 0.1076, + "num_tokens": 1382941074.0, + "reward": 0.30738329887390137, + "reward_std": 0.057456813752651215, + "rewards/progression_diversity/mean": -0.017531972378492355, + "rewards/progression_diversity/std": 0.07124343514442444, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.6723307371139526, + "rewards/symbolic_reward_partial_score/std": 0.23962588608264923, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0223664045333862, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 30.863096237182617, + "step": 2365 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.306650310754776, + "epoch": 3.7916666666666665, + "grad_norm": 0.026560494676232338, + "learning_rate": 1e-06, + "loss": 0.1009, + "step": 2366 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.313573881983757, + "epoch": 3.793269230769231, + "grad_norm": 0.01731313206255436, + "learning_rate": 1e-06, + "loss": 0.1042, + "step": 2367 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.32431718707084656, + "epoch": 3.7948717948717947, + "grad_norm": 0.012698481790721416, + "learning_rate": 1e-06, + "loss": 0.0471, + "step": 2368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3138.0, + "completions/mean_length": 2853.966796875, + "completions/mean_terminated_length": 1891.5794677734375, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "entropy": 0.31773290038108826, + "epoch": 3.796474358974359, + "frac_reward_zero_std": 0.15625, + "grad_norm": 901.328369140625, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 1385256705.0, + "reward": 0.2686350345611572, + "reward_std": 0.06755457073450089, + "rewards/progression_diversity/mean": -0.018334418535232544, + "rewards/progression_diversity/std": 0.07072144001722336, + "rewards/symbolic_reward_accuracy/mean": 0.09765625, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.7098632454872131, + "rewards/symbolic_reward_partial_score/std": 0.21099787950515747, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.017049789428711, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 34.733001708984375, + "step": 2369 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.33099713921546936, + "epoch": 3.7980769230769234, + "grad_norm": 0.01458797324448824, + "learning_rate": 1e-06, + "loss": 0.0647, + "step": 2370 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2994411885738373, + "epoch": 3.7996794871794872, + "grad_norm": 0.03737116977572441, + "learning_rate": 1e-06, + "loss": 0.1793, + "step": 2371 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.32057875394821167, + "epoch": 3.801282051282051, + "grad_norm": 0.011851564049720764, + "learning_rate": 1e-06, + "loss": 0.0763, + "step": 2372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3275.0, + "completions/mean_length": 2494.095703125, + "completions/mean_terminated_length": 1870.46728515625, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.321204349398613, + "epoch": 3.8028846153846154, + "frac_reward_zero_std": 0.40625, + "grad_norm": 464.8273620605469, + "learning_rate": 1e-06, + "loss": 0.0536, + "num_tokens": 1387418002.0, + "reward": 0.38406121730804443, + "reward_std": 0.06720907241106033, + "rewards/progression_diversity/mean": -0.011848336085677147, + "rewards/progression_diversity/std": 0.05758029222488403, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.7552083730697632, + "rewards/symbolic_reward_partial_score/std": 0.21863536536693573, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.042336344718933, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 22.52627944946289, + "step": 2373 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.33628255128860474, + "epoch": 3.8044871794871797, + "grad_norm": 16.08426856994629, + "learning_rate": 1e-06, + "loss": 0.047, + "step": 2374 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3366074562072754, + "epoch": 3.8060897435897436, + "grad_norm": 4.6837029457092285, + "learning_rate": 1e-06, + "loss": 0.0627, + "step": 2375 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.32915179431438446, + "epoch": 3.8076923076923075, + "grad_norm": 0.2614407241344452, + "learning_rate": 1e-06, + "loss": 0.0403, + "step": 2376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3255.0, + "completions/mean_length": 2107.373046875, + "completions/mean_terminated_length": 1880.760009765625, + "completions/min_length": 697.0, + "completions/min_terminated_length": 697.0, + "entropy": 0.33755332231521606, + "epoch": 3.809294871794872, + "frac_reward_zero_std": 0.46875, + "grad_norm": 317.5009460449219, + "learning_rate": 1e-06, + "loss": 0.0604, + "num_tokens": 1389344513.0, + "reward": 0.2682853937149048, + "reward_std": 0.022097572684288025, + "rewards/progression_diversity/mean": -0.003493726020678878, + "rewards/progression_diversity/std": 0.02860984578728676, + "rewards/symbolic_reward_accuracy/mean": 0.091796875, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.712109386920929, + "rewards/symbolic_reward_partial_score/std": 0.19135794043540955, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.067301869392395, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 10.378652572631836, + "step": 2377 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3546573221683502, + "epoch": 3.810897435897436, + "grad_norm": 0.024478528648614883, + "learning_rate": 1e-06, + "loss": 0.2443, + "step": 2378 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.36258837580680847, + "epoch": 3.8125, + "grad_norm": 0.020279182121157646, + "learning_rate": 1e-06, + "loss": 0.0001, + "step": 2379 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3494533598423004, + "epoch": 3.814102564102564, + "grad_norm": 0.021380124613642693, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 2380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3135.0, + "completions/mean_length": 2129.833984375, + "completions/mean_terminated_length": 1874.7891845703125, + "completions/min_length": 1129.0, + "completions/min_terminated_length": 1129.0, + "entropy": 0.33585888147354126, + "epoch": 3.815705128205128, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1741.7381591796875, + "learning_rate": 1e-06, + "loss": 0.0675, + "num_tokens": 1391386876.0, + "reward": 0.23786211013793945, + "reward_std": 0.01681671291589737, + "rewards/progression_diversity/mean": -0.0043185316026210785, + "rewards/progression_diversity/std": 0.03434579074382782, + "rewards/symbolic_reward_accuracy/mean": 0.0625, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.6706217527389526, + "rewards/symbolic_reward_partial_score/std": 0.17827239632606506, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0659494400024414, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 11.326656341552734, + "step": 2381 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.35594816505908966, + "epoch": 3.8173076923076925, + "grad_norm": 0.012054548598825932, + "learning_rate": 1e-06, + "loss": 0.001, + "step": 2382 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3579705059528351, + "epoch": 3.8189102564102564, + "grad_norm": 0.025561781600117683, + "learning_rate": 1e-06, + "loss": 0.0028, + "step": 2383 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.33812710642814636, + "epoch": 3.8205128205128203, + "grad_norm": 0.027250172570347786, + "learning_rate": 1e-06, + "loss": 0.0297, + "step": 2384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3329.0, + "completions/mean_length": 2672.564453125, + "completions/mean_terminated_length": 1819.1556396484375, + "completions/min_length": 961.0, + "completions/min_terminated_length": 961.0, + "entropy": 0.3372623771429062, + "epoch": 3.8221153846153846, + "frac_reward_zero_std": 0.34375, + "grad_norm": 106.94781494140625, + "learning_rate": 1e-06, + "loss": -0.0134, + "num_tokens": 1393736781.0, + "reward": 0.338218629360199, + "reward_std": 0.033895812928676605, + "rewards/progression_diversity/mean": -0.015542363747954369, + "rewards/progression_diversity/std": 0.06333222985267639, + "rewards/symbolic_reward_accuracy/mean": 0.21484375, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.7027831673622131, + "rewards/symbolic_reward_partial_score/std": 0.22551730275154114, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0194793939590454, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 35.487525939941406, + "step": 2385 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3197958320379257, + "epoch": 3.823717948717949, + "grad_norm": 3424.5888671875, + "learning_rate": 1e-06, + "loss": 0.4028, + "step": 2386 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3210483342409134, + "epoch": 3.8253205128205128, + "grad_norm": 0.28248870372772217, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 2387 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.31268228590488434, + "epoch": 3.8269230769230766, + "grad_norm": 0.01564093679189682, + "learning_rate": 1e-06, + "loss": 0.1354, + "step": 2388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2804.0, + "completions/mean_length": 2821.177734375, + "completions/mean_terminated_length": 1764.7052001953125, + "completions/min_length": 957.0, + "completions/min_terminated_length": 957.0, + "entropy": 0.3302348703145981, + "epoch": 3.828525641025641, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1121.2408447265625, + "learning_rate": 1e-06, + "loss": 0.0995, + "num_tokens": 1395997880.0, + "reward": 0.3832484483718872, + "reward_std": 0.039947956800460815, + "rewards/progression_diversity/mean": -0.017442844808101654, + "rewards/progression_diversity/std": 0.06394115090370178, + "rewards/symbolic_reward_accuracy/mean": 0.2734375, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.733154296875, + "rewards/symbolic_reward_partial_score/std": 0.22632841765880585, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0048656463623047, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 43.669517517089844, + "step": 2389 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2924426198005676, + "epoch": 3.8301282051282053, + "grad_norm": 1060.7529296875, + "learning_rate": 1e-06, + "loss": 0.3549, + "step": 2390 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.34059783816337585, + "epoch": 3.831730769230769, + "grad_norm": 1.79716956615448, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 2391 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3371441662311554, + "epoch": 3.8333333333333335, + "grad_norm": 15.651530265808105, + "learning_rate": 1e-06, + "loss": 0.0659, + "step": 2392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2952.0, + "completions/mean_length": 2100.259765625, + "completions/mean_terminated_length": 1786.6446533203125, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "entropy": 0.3499729782342911, + "epoch": 3.8349358974358974, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.016351254656910896, + "learning_rate": 1e-06, + "loss": 0.0287, + "num_tokens": 1397980253.0, + "reward": 0.4495462477207184, + "reward_std": 0.04341159760951996, + "rewards/progression_diversity/mean": -0.006314532831311226, + "rewards/progression_diversity/std": 0.04331482574343681, + "rewards/symbolic_reward_accuracy/mean": 0.361328125, + "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, + "rewards/symbolic_reward_partial_score/mean": 0.779296875, + "rewards/symbolic_reward_partial_score/std": 0.22735320031642914, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0571317672729492, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 16.10877227783203, + "step": 2393 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3411282151937485, + "epoch": 3.8365384615384617, + "grad_norm": 8039.83837890625, + "learning_rate": 1e-06, + "loss": 0.8501, + "step": 2394 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.356269970536232, + "epoch": 3.8381410256410255, + "grad_norm": 289.86322021484375, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 2395 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3452070653438568, + "epoch": 3.83974358974359, + "grad_norm": 0.014756686054170132, + "learning_rate": 1e-06, + "loss": 0.0689, + "step": 2396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4212.0, + "completions/mean_length": 2003.048828125, + "completions/mean_terminated_length": 1774.7799072265625, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "entropy": 0.3432788699865341, + "epoch": 3.8413461538461537, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1194.415771484375, + "learning_rate": 1e-06, + "loss": 0.0683, + "num_tokens": 1399852966.0, + "reward": 0.3248797655105591, + "reward_std": 0.023387808352708817, + "rewards/progression_diversity/mean": -0.004212523810565472, + "rewards/progression_diversity/std": 0.034703828394412994, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.7139322757720947, + "rewards/symbolic_reward_partial_score/std": 0.2121579349040985, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0637973546981812, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 12.788324356079102, + "step": 2397 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.36405032873153687, + "epoch": 3.842948717948718, + "grad_norm": 0.022353997454047203, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 2398 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.35727696120738983, + "epoch": 3.844551282051282, + "grad_norm": 1.3744477033615112, + "learning_rate": 1e-06, + "loss": -0.002, + "step": 2399 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.35195624828338623, + "epoch": 3.8461538461538463, + "grad_norm": 0.02140852063894272, + "learning_rate": 1e-06, + "loss": 0.0567, + "step": 2400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3118.0, + "completions/mean_length": 1852.724609375, + "completions/mean_terminated_length": 1767.07861328125, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "entropy": 0.36357755959033966, + "epoch": 3.84775641025641, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.02421417459845543, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 1401668985.0, + "reward": 0.31430160999298096, + "reward_std": 0.01744687370955944, + "rewards/progression_diversity/mean": -0.001969876466318965, + "rewards/progression_diversity/std": 0.026346473023295403, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.739794909954071, + "rewards/symbolic_reward_partial_score/std": 0.19770273566246033, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0794093608856201, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 5.0978240966796875, + "step": 2401 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.35801486670970917, + "epoch": 3.8493589743589745, + "grad_norm": 0.02062184363603592, + "learning_rate": 1e-06, + "loss": 36.5217, + "step": 2402 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.36254678666591644, + "epoch": 3.8509615384615383, + "grad_norm": 0.024333849549293518, + "learning_rate": 1e-06, + "loss": 216.8933, + "step": 2403 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.3597784638404846, + "epoch": 3.8525641025641026, + "grad_norm": 0.016081588342785835, + "learning_rate": 1e-06, + "loss": 2.3758, + "step": 2404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2842.0, + "completions/mean_length": 1912.380859375, + "completions/mean_terminated_length": 1798.43115234375, + "completions/min_length": 1112.0, + "completions/min_terminated_length": 1112.0, + "entropy": 0.3635348379611969, + "epoch": 3.8541666666666665, + "frac_reward_zero_std": 0.53125, + "grad_norm": 754.1912231445312, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 1403545500.0, + "reward": 0.31084516644477844, + "reward_std": 0.025590339675545692, + "rewards/progression_diversity/mean": -0.002397148869931698, + "rewards/progression_diversity/std": 0.027250634506344795, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.6853190660476685, + "rewards/symbolic_reward_partial_score/std": 0.19453242421150208, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0779601335525513, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 6.665762901306152, + "step": 2405 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.3639901280403137, + "epoch": 3.855769230769231, + "grad_norm": 0.02506587654352188, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 2406 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.36371636390686035, + "epoch": 3.8573717948717947, + "grad_norm": 0.019767044112086296, + "learning_rate": 1e-06, + "loss": 0.0172, + "step": 2407 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.37076058983802795, + "epoch": 3.858974358974359, + "grad_norm": 0.021978937089443207, + "learning_rate": 1e-06, + "loss": -0.0016, + "step": 2408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3139.0, + "completions/mean_length": 1938.568359375, + "completions/mean_terminated_length": 1796.1085205078125, + "completions/min_length": 1101.0, + "completions/min_terminated_length": 1101.0, + "entropy": 0.3613286018371582, + "epoch": 3.8605769230769234, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.014481249265372753, + "learning_rate": 1e-06, + "loss": 0.0153, + "num_tokens": 1405497455.0, + "reward": 0.3004390001296997, + "reward_std": 0.0335419736802578, + "rewards/progression_diversity/mean": -0.0029758564196527004, + "rewards/progression_diversity/std": 0.031783487647771835, + "rewards/symbolic_reward_accuracy/mean": 0.14453125, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.7151041626930237, + "rewards/symbolic_reward_partial_score/std": 0.19497309625148773, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.074684500694275, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 8.23452377319336, + "step": 2409 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3613293021917343, + "epoch": 3.8621794871794872, + "grad_norm": 0.0199386365711689, + "learning_rate": 1e-06, + "loss": 0.0316, + "step": 2410 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3616602122783661, + "epoch": 3.863782051282051, + "grad_norm": 0.5316094756126404, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 2411 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.36158451437950134, + "epoch": 3.8653846153846154, + "grad_norm": 0.012657379731535912, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 2412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3447.0, + "completions/mean_length": 1951.318359375, + "completions/mean_terminated_length": 1837.6751708984375, + "completions/min_length": 755.0, + "completions/min_terminated_length": 755.0, + "entropy": 0.3670198619365692, + "epoch": 3.8669871794871797, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.029966576024889946, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 1407303266.0, + "reward": 0.29933494329452515, + "reward_std": 0.02662837505340576, + "rewards/progression_diversity/mean": -0.001564997830428183, + "rewards/progression_diversity/std": 0.01767376810312271, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.693798840045929, + "rewards/symbolic_reward_partial_score/std": 0.1757093071937561, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.079904317855835, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 6.317461967468262, + "step": 2413 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.380515456199646, + "epoch": 3.8685897435897436, + "grad_norm": 0.014633157290518284, + "learning_rate": 1e-06, + "loss": -0.0096, + "step": 2414 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.36171650886535645, + "epoch": 3.8701923076923075, + "grad_norm": 0.019229549914598465, + "learning_rate": 1e-06, + "loss": 0.0416, + "step": 2415 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3781423568725586, + "epoch": 3.871794871794872, + "grad_norm": 0.031127827242016792, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 2416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3458.0, + "completions/mean_length": 2027.650390625, + "completions/mean_terminated_length": 1914.6082763671875, + "completions/min_length": 1041.0, + "completions/min_terminated_length": 1041.0, + "entropy": 0.37264350056648254, + "epoch": 3.873397435897436, + "frac_reward_zero_std": 0.4375, + "grad_norm": 181.88681030273438, + "learning_rate": 1e-06, + "loss": 0.0241, + "num_tokens": 1409189023.0, + "reward": 0.33668702840805054, + "reward_std": 0.035758331418037415, + "rewards/progression_diversity/mean": -0.0017103657592087984, + "rewards/progression_diversity/std": 0.020190343260765076, + "rewards/symbolic_reward_accuracy/mean": 0.20703125, + "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, + "rewards/symbolic_reward_partial_score/mean": 0.7089356184005737, + "rewards/symbolic_reward_partial_score/std": 0.2059522420167923, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.079702377319336, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 5.716109275817871, + "step": 2417 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3729843199253082, + "epoch": 3.875, + "grad_norm": 0.0172417052090168, + "learning_rate": 1e-06, + "loss": -0.0047, + "step": 2418 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3554026186466217, + "epoch": 3.876602564102564, + "grad_norm": 0.014652607962489128, + "learning_rate": 1e-06, + "loss": 0.0365, + "step": 2419 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.36666855216026306, + "epoch": 3.878205128205128, + "grad_norm": 0.025280388072133064, + "learning_rate": 1e-06, + "loss": 0.0004, + "step": 2420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3165.0, + "completions/mean_length": 2169.5234375, + "completions/mean_terminated_length": 1972.4912109375, + "completions/min_length": 1212.0, + "completions/min_terminated_length": 1212.0, + "entropy": 0.35489410161972046, + "epoch": 3.8798076923076925, + "frac_reward_zero_std": 0.5, + "grad_norm": 993.462158203125, + "learning_rate": 1e-06, + "loss": 0.0898, + "num_tokens": 1411120619.0, + "reward": 0.3209129571914673, + "reward_std": 0.036054469645023346, + "rewards/progression_diversity/mean": -0.0024563795886933804, + "rewards/progression_diversity/std": 0.022509947419166565, + "rewards/symbolic_reward_accuracy/mean": 0.16796875, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.7371094226837158, + "rewards/symbolic_reward_partial_score/std": 0.17446953058242798, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0719996690750122, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 9.719746589660645, + "step": 2421 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.372598797082901, + "epoch": 3.8814102564102564, + "grad_norm": 0.00928138755261898, + "learning_rate": 1e-06, + "loss": -0.0118, + "step": 2422 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3691015988588333, + "epoch": 3.8830128205128203, + "grad_norm": 0.018029429018497467, + "learning_rate": 1e-06, + "loss": 0.0338, + "step": 2423 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.36332499980926514, + "epoch": 3.8846153846153846, + "grad_norm": 0.013046249747276306, + "learning_rate": 1e-06, + "loss": 0.0302, + "step": 2424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3514.0, + "completions/mean_length": 2295.619140625, + "completions/mean_terminated_length": 1957.498046875, + "completions/min_length": 1022.0, + "completions/min_terminated_length": 1022.0, + "entropy": 0.3619091063737869, + "epoch": 3.886217948717949, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1141.9002685546875, + "learning_rate": 1e-06, + "loss": 0.0544, + "num_tokens": 1413051752.0, + "reward": 0.3810175061225891, + "reward_std": 0.04954010993242264, + "rewards/progression_diversity/mean": -0.00469579640775919, + "rewards/progression_diversity/std": 0.03242946416139603, + "rewards/symbolic_reward_accuracy/mean": 0.2421875, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.789746105670929, + "rewards/symbolic_reward_partial_score/std": 0.18808871507644653, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0633162260055542, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 14.651718139648438, + "step": 2425 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.36615803837776184, + "epoch": 3.8878205128205128, + "grad_norm": 833.3787231445312, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 2426 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.37386661767959595, + "epoch": 3.8894230769230766, + "grad_norm": 0.020444680005311966, + "learning_rate": 1e-06, + "loss": 0.0178, + "step": 2427 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3629365861415863, + "epoch": 3.891025641025641, + "grad_norm": 0.02197657711803913, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 2428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3316.0, + "completions/mean_length": 2237.09375, + "completions/mean_terminated_length": 1983.9681396484375, + "completions/min_length": 1003.0, + "completions/min_terminated_length": 1003.0, + "entropy": 0.36241379380226135, + "epoch": 3.8926282051282053, + "frac_reward_zero_std": 0.53125, + "grad_norm": 351.0287780761719, + "learning_rate": 1e-06, + "loss": 0.0279, + "num_tokens": 1415054008.0, + "reward": 0.33530277013778687, + "reward_std": 0.013850128278136253, + "rewards/progression_diversity/mean": -0.0029261300805956125, + "rewards/progression_diversity/std": 0.02486591413617134, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.7447265386581421, + "rewards/symbolic_reward_partial_score/std": 0.18490070104599, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0711203813552856, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 10.22694206237793, + "step": 2429 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3631092756986618, + "epoch": 3.894230769230769, + "grad_norm": 0.015336292795836926, + "learning_rate": 1e-06, + "loss": 0.0474, + "step": 2430 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3701094090938568, + "epoch": 3.8958333333333335, + "grad_norm": 0.019072409719228745, + "learning_rate": 1e-06, + "loss": -0.0113, + "step": 2431 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.36329878866672516, + "epoch": 3.8974358974358974, + "grad_norm": 0.009201987646520138, + "learning_rate": 1e-06, + "loss": 0.0567, + "step": 2432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3290.0, + "completions/mean_length": 2161.703125, + "completions/mean_terminated_length": 1964.5623779296875, + "completions/min_length": 1002.0, + "completions/min_terminated_length": 1002.0, + "entropy": 0.36919447779655457, + "epoch": 3.8990384615384617, + "frac_reward_zero_std": 0.46875, + "grad_norm": 651.1842041015625, + "learning_rate": 1e-06, + "loss": 0.0223, + "num_tokens": 1417071456.0, + "reward": 0.26072680950164795, + "reward_std": 0.03266744688153267, + "rewards/progression_diversity/mean": -0.0025163046084344387, + "rewards/progression_diversity/std": 0.024175414815545082, + "rewards/symbolic_reward_accuracy/mean": 0.0859375, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.6992512941360474, + "rewards/symbolic_reward_partial_score/std": 0.1867949366569519, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0748156309127808, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 8.038581848144531, + "step": 2433 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3648574501276016, + "epoch": 3.9006410256410255, + "grad_norm": 0.026968425139784813, + "learning_rate": 1e-06, + "loss": 0.0404, + "step": 2434 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3710328936576843, + "epoch": 3.90224358974359, + "grad_norm": 0.022169405594468117, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 2435 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3560585528612137, + "epoch": 3.9038461538461537, + "grad_norm": 0.012255042791366577, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 2436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3374.0, + "completions/mean_length": 2268.74609375, + "completions/mean_terminated_length": 1987.5657958984375, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "entropy": 0.36314675211906433, + "epoch": 3.905448717948718, + "frac_reward_zero_std": 0.59375, + "grad_norm": 26.744871139526367, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 1419090318.0, + "reward": 0.41112661361694336, + "reward_std": 0.022687647491693497, + "rewards/progression_diversity/mean": -0.0030626305378973484, + "rewards/progression_diversity/std": 0.022572841495275497, + "rewards/symbolic_reward_accuracy/mean": 0.306640625, + "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, + "rewards/symbolic_reward_partial_score/mean": 0.7591959238052368, + "rewards/symbolic_reward_partial_score/std": 0.19385521113872528, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0674618482589722, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 11.076675415039062, + "step": 2437 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3628970682621002, + "epoch": 3.907051282051282, + "grad_norm": 0.018419716507196426, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 2438 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.34759269654750824, + "epoch": 3.9086538461538463, + "grad_norm": 0.008998063392937183, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 2439 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3605688661336899, + "epoch": 3.91025641025641, + "grad_norm": 0.010430578142404556, + "learning_rate": 1e-06, + "loss": 0.0363, + "step": 2440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3622.0, + "completions/mean_length": 2239.3984375, + "completions/mean_terminated_length": 2014.881103515625, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.3595893830060959, + "epoch": 3.9118589743589745, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.020290104672312737, + "learning_rate": 1e-06, + "loss": 0.0345, + "num_tokens": 1421054490.0, + "reward": 0.3346139192581177, + "reward_std": 0.03494077920913696, + "rewards/progression_diversity/mean": -0.002476356690749526, + "rewards/progression_diversity/std": 0.022448500618338585, + "rewards/symbolic_reward_accuracy/mean": 0.193359375, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.7313476800918579, + "rewards/symbolic_reward_partial_score/std": 0.19797399640083313, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0738534927368164, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 8.171432495117188, + "step": 2441 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3624930679798126, + "epoch": 3.9134615384615383, + "grad_norm": 0.01627454161643982, + "learning_rate": 1e-06, + "loss": 0.0318, + "step": 2442 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3630795478820801, + "epoch": 3.9150641025641026, + "grad_norm": 0.011845182627439499, + "learning_rate": 1e-06, + "loss": 0.0527, + "step": 2443 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3635891079902649, + "epoch": 3.9166666666666665, + "grad_norm": 0.012927143834531307, + "learning_rate": 1e-06, + "loss": 0.0315, + "step": 2444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3619.0, + "completions/mean_length": 2434.078125, + "completions/mean_terminated_length": 2041.91162109375, + "completions/min_length": 687.0, + "completions/min_terminated_length": 687.0, + "entropy": 0.3447739779949188, + "epoch": 3.918269230769231, + "frac_reward_zero_std": 0.40625, + "grad_norm": 580.8510131835938, + "learning_rate": 1e-06, + "loss": 0.0723, + "num_tokens": 1423102786.0, + "reward": 0.39957115054130554, + "reward_std": 0.04453998804092407, + "rewards/progression_diversity/mean": -0.00382626592181623, + "rewards/progression_diversity/std": 0.02524666115641594, + "rewards/symbolic_reward_accuracy/mean": 0.283203125, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.7708333134651184, + "rewards/symbolic_reward_partial_score/std": 0.20369426906108856, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0627055168151855, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 13.541450500488281, + "step": 2445 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3629473149776459, + "epoch": 3.9198717948717947, + "grad_norm": 0.03049568459391594, + "learning_rate": 1e-06, + "loss": 0.041, + "step": 2446 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.360695943236351, + "epoch": 3.921474358974359, + "grad_norm": 0.03725721687078476, + "learning_rate": 1e-06, + "loss": 0.0375, + "step": 2447 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.36237798631191254, + "epoch": 3.9230769230769234, + "grad_norm": 0.01814689300954342, + "learning_rate": 1e-06, + "loss": 0.0465, + "step": 2448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4143.0, + "completions/mean_length": 2448.4453125, + "completions/mean_terminated_length": 2027.8551025390625, + "completions/min_length": 1131.0, + "completions/min_terminated_length": 1131.0, + "entropy": 0.3400779664516449, + "epoch": 3.9246794871794872, + "frac_reward_zero_std": 0.46875, + "grad_norm": 717.636962890625, + "learning_rate": 1e-06, + "loss": 0.1025, + "num_tokens": 1425199558.0, + "reward": 0.3190501034259796, + "reward_std": 0.047453030943870544, + "rewards/progression_diversity/mean": -0.0046582636423408985, + "rewards/progression_diversity/std": 0.029829828068614006, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.7205566167831421, + "rewards/symbolic_reward_partial_score/std": 0.21557864546775818, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0601270198822021, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 14.778732299804688, + "step": 2449 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.36184197664260864, + "epoch": 3.926282051282051, + "grad_norm": 0.07136629521846771, + "learning_rate": 1e-06, + "loss": -0.0179, + "step": 2450 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.36823903024196625, + "epoch": 3.9278846153846154, + "grad_norm": 0.012042498216032982, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 2451 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3471096456050873, + "epoch": 3.9294871794871797, + "grad_norm": 0.020903823897242546, + "learning_rate": 1e-06, + "loss": 0.1012, + "step": 2452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4468.0, + "completions/mean_length": 2371.39453125, + "completions/mean_terminated_length": 2006.336669921875, + "completions/min_length": 1198.0, + "completions/min_terminated_length": 1198.0, + "entropy": 0.3484654426574707, + "epoch": 3.9310897435897436, + "frac_reward_zero_std": 0.4375, + "grad_norm": 2592.323974609375, + "learning_rate": 1e-06, + "loss": 0.0898, + "num_tokens": 1427341920.0, + "reward": 0.32160401344299316, + "reward_std": 0.039857640862464905, + "rewards/progression_diversity/mean": -0.004153084009885788, + "rewards/progression_diversity/std": 0.028649339452385902, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.729052722454071, + "rewards/symbolic_reward_partial_score/std": 0.21416015923023224, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0638084411621094, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 12.2067232131958, + "step": 2453 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.347993940114975, + "epoch": 3.9326923076923075, + "grad_norm": 0.02119533158838749, + "learning_rate": 1e-06, + "loss": 0.0476, + "step": 2454 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3478931188583374, + "epoch": 3.934294871794872, + "grad_norm": 0.014371760189533234, + "learning_rate": 1e-06, + "loss": 0.0499, + "step": 2455 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.35639145970344543, + "epoch": 3.935897435897436, + "grad_norm": 0.015084910206496716, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 2456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3296.0, + "completions/mean_length": 2630.390625, + "completions/mean_terminated_length": 1953.9835205078125, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "entropy": 0.34472502768039703, + "epoch": 3.9375, + "frac_reward_zero_std": 0.375, + "grad_norm": 823.965576171875, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 1429514888.0, + "reward": 0.386918306350708, + "reward_std": 0.04596348851919174, + "rewards/progression_diversity/mean": -0.009828666225075722, + "rewards/progression_diversity/std": 0.04648015275597572, + "rewards/symbolic_reward_accuracy/mean": 0.271484375, + "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, + "rewards/symbolic_reward_partial_score/mean": 0.7503417730331421, + "rewards/symbolic_reward_partial_score/std": 0.21974274516105652, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0469261407852173, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 20.604896545410156, + "step": 2457 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3289848268032074, + "epoch": 3.939102564102564, + "grad_norm": 0.01477957796305418, + "learning_rate": 1e-06, + "loss": 0.131, + "step": 2458 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3502272367477417, + "epoch": 3.940705128205128, + "grad_norm": 0.026974955573678017, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 2459 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.34581419825553894, + "epoch": 3.9423076923076925, + "grad_norm": 0.015963274985551834, + "learning_rate": 1e-06, + "loss": 0.0406, + "step": 2460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3357.0, + "completions/mean_length": 2388.841796875, + "completions/mean_terminated_length": 1937.385009765625, + "completions/min_length": 1023.0, + "completions/min_terminated_length": 1023.0, + "entropy": 0.35383833944797516, + "epoch": 3.9439102564102564, + "frac_reward_zero_std": 0.53125, + "grad_norm": 648.1668090820312, + "learning_rate": 1e-06, + "loss": 0.0584, + "num_tokens": 1431583207.0, + "reward": 0.3101181387901306, + "reward_std": 0.026755383238196373, + "rewards/progression_diversity/mean": -0.00625104084610939, + "rewards/progression_diversity/std": 0.03660469129681587, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.7318522334098816, + "rewards/symbolic_reward_partial_score/std": 0.19400212168693542, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0573663711547852, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 16.051483154296875, + "step": 2461 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.36202606558799744, + "epoch": 3.9455128205128203, + "grad_norm": 0.061913300305604935, + "learning_rate": 1e-06, + "loss": 0.015, + "step": 2462 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.36147141456604004, + "epoch": 3.9471153846153846, + "grad_norm": 0.014589556492865086, + "learning_rate": 1e-06, + "loss": 0.0385, + "step": 2463 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3382275700569153, + "epoch": 3.948717948717949, + "grad_norm": 0.013087283819913864, + "learning_rate": 1e-06, + "loss": 0.0554, + "step": 2464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3255.0, + "completions/mean_length": 2498.75, + "completions/mean_terminated_length": 1963.61865234375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 0.35105569660663605, + "epoch": 3.9503205128205128, + "frac_reward_zero_std": 0.25, + "grad_norm": 1137.3775634765625, + "learning_rate": 1e-06, + "loss": 0.0478, + "num_tokens": 1433752439.0, + "reward": 0.384078711271286, + "reward_std": 0.03686286136507988, + "rewards/progression_diversity/mean": -0.007657800801098347, + "rewards/progression_diversity/std": 0.04040105640888214, + "rewards/symbolic_reward_accuracy/mean": 0.275390625, + "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, + "rewards/symbolic_reward_partial_score/mean": 0.7329915165901184, + "rewards/symbolic_reward_partial_score/std": 0.22255964577198029, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0503382682800293, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 19.593889236450195, + "step": 2465 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3548099845647812, + "epoch": 3.9519230769230766, + "grad_norm": 0.07064145803451538, + "learning_rate": 1e-06, + "loss": 0.0108, + "step": 2466 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3451448976993561, + "epoch": 3.953525641025641, + "grad_norm": 0.03678792715072632, + "learning_rate": 1e-06, + "loss": 0.0902, + "step": 2467 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.34411998093128204, + "epoch": 3.9551282051282053, + "grad_norm": 0.026875555515289307, + "learning_rate": 1e-06, + "loss": 0.0816, + "step": 2468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3373.0, + "completions/mean_length": 2192.21484375, + "completions/mean_terminated_length": 1909.510009765625, + "completions/min_length": 1090.0, + "completions/min_terminated_length": 1090.0, + "entropy": 0.3781355321407318, + "epoch": 3.956730769230769, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01978994905948639, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 1435768037.0, + "reward": 0.2731976807117462, + "reward_std": 0.02287241816520691, + "rewards/progression_diversity/mean": -0.004451290238648653, + "rewards/progression_diversity/std": 0.0341835618019104, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.6666666865348816, + "rewards/symbolic_reward_partial_score/std": 0.2050163298845291, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0635859966278076, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 13.839681625366211, + "step": 2469 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3631334751844406, + "epoch": 3.9583333333333335, + "grad_norm": 4.625295639038086, + "learning_rate": 1e-06, + "loss": 0.0434, + "step": 2470 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.35326145589351654, + "epoch": 3.9599358974358974, + "grad_norm": 0.01084475964307785, + "learning_rate": 1e-06, + "loss": 0.0951, + "step": 2471 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3633986711502075, + "epoch": 3.9615384615384617, + "grad_norm": 0.013056226074695587, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 2472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3047.0, + "completions/mean_length": 2316.728515625, + "completions/mean_terminated_length": 1892.1629638671875, + "completions/min_length": 1024.0, + "completions/min_terminated_length": 1024.0, + "entropy": 0.36079515516757965, + "epoch": 3.9631410256410255, + "frac_reward_zero_std": 0.3125, + "grad_norm": 649.272216796875, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 1437927290.0, + "reward": 0.2825604975223541, + "reward_std": 0.047691911458969116, + "rewards/progression_diversity/mean": -0.007623001933097839, + "rewards/progression_diversity/std": 0.04595175012946129, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.6758463382720947, + "rewards/symbolic_reward_partial_score/std": 0.2019786536693573, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0513596534729004, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 19.716333389282227, + "step": 2473 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3496604412794113, + "epoch": 3.96474358974359, + "grad_norm": 3.18587327003479, + "learning_rate": 1e-06, + "loss": 0.0646, + "step": 2474 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3459862917661667, + "epoch": 3.9663461538461537, + "grad_norm": 0.016324063763022423, + "learning_rate": 1e-06, + "loss": 0.0452, + "step": 2475 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.35571056604385376, + "epoch": 3.967948717948718, + "grad_norm": 0.014685800299048424, + "learning_rate": 1e-06, + "loss": 0.0737, + "step": 2476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3300.0, + "completions/mean_length": 2103.21484375, + "completions/mean_terminated_length": 1876.5357666015625, + "completions/min_length": 1079.0, + "completions/min_terminated_length": 1079.0, + "entropy": 0.36506491899490356, + "epoch": 3.969551282051282, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.013999047689139843, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 1439892248.0, + "reward": 0.34133556485176086, + "reward_std": 0.026946410536766052, + "rewards/progression_diversity/mean": -0.0046281758695840836, + "rewards/progression_diversity/std": 0.038301728665828705, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.7603353261947632, + "rewards/symbolic_reward_partial_score/std": 0.17550498247146606, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0721997022628784, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 10.131376266479492, + "step": 2477 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.36544330418109894, + "epoch": 3.9711538461538463, + "grad_norm": 0.02787620946764946, + "learning_rate": 1e-06, + "loss": 0.0459, + "step": 2478 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3669802248477936, + "epoch": 3.97275641025641, + "grad_norm": 0.015974465757608414, + "learning_rate": 1e-06, + "loss": 0.0146, + "step": 2479 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.36941133439540863, + "epoch": 3.9743589743589745, + "grad_norm": 0.02147599868476391, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 2480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2934.0, + "completions/mean_length": 3074.95703125, + "completions/mean_terminated_length": 1885.63818359375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 1061.0, + "entropy": 0.3382083773612976, + "epoch": 3.9759615384615383, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1267.6973876953125, + "learning_rate": 1e-06, + "loss": 0.0402, + "num_tokens": 1442438978.0, + "reward": 0.2858215570449829, + "reward_std": 0.044375061988830566, + "rewards/progression_diversity/mean": -0.023311004042625427, + "rewards/progression_diversity/std": 0.08079320937395096, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.6885416507720947, + "rewards/symbolic_reward_partial_score/std": 0.19891256093978882, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.005120038986206, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 43.875850677490234, + "step": 2481 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3479975014925003, + "epoch": 3.9775641025641026, + "grad_norm": 0.0137171084061265, + "learning_rate": 1e-06, + "loss": 0.042, + "step": 2482 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.31158269941806793, + "epoch": 3.9791666666666665, + "grad_norm": 0.028567982837557793, + "learning_rate": 1e-06, + "loss": 0.159, + "step": 2483 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.31820471584796906, + "epoch": 3.980769230769231, + "grad_norm": 0.03962567821145058, + "learning_rate": 1e-06, + "loss": 0.1121, + "step": 2484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3611.0, + "completions/mean_length": 3046.513671875, + "completions/mean_terminated_length": 1946.805419921875, + "completions/min_length": 1105.0, + "completions/min_terminated_length": 1105.0, + "entropy": 0.33995670080184937, + "epoch": 3.9823717948717947, + "frac_reward_zero_std": 0.34375, + "grad_norm": 649.2957763671875, + "learning_rate": 1e-06, + "loss": 0.0291, + "num_tokens": 1444918329.0, + "reward": 0.3851238489151001, + "reward_std": 0.020528025925159454, + "rewards/progression_diversity/mean": -0.020821530371904373, + "rewards/progression_diversity/std": 0.0761089101433754, + "rewards/symbolic_reward_accuracy/mean": 0.28125, + "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, + "rewards/symbolic_reward_partial_score/mean": 0.7258462905883789, + "rewards/symbolic_reward_partial_score/std": 0.23976966738700867, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0180208683013916, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 37.60797882080078, + "step": 2485 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3226911425590515, + "epoch": 3.983974358974359, + "grad_norm": 3.0483925342559814, + "learning_rate": 1e-06, + "loss": 0.1089, + "step": 2486 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3385407626628876, + "epoch": 3.9855769230769234, + "grad_norm": 0.01842661201953888, + "learning_rate": 1e-06, + "loss": 0.0844, + "step": 2487 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3237884044647217, + "epoch": 3.9871794871794872, + "grad_norm": 0.01716519333422184, + "learning_rate": 1e-06, + "loss": 0.1273, + "step": 2488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3178.0, + "completions/mean_length": 3356.56640625, + "completions/mean_terminated_length": 1883.89990234375, + "completions/min_length": 1032.0, + "completions/min_terminated_length": 1032.0, + "entropy": 0.33329902589321136, + "epoch": 3.988782051282051, + "frac_reward_zero_std": 0.34375, + "grad_norm": 1143.9063720703125, + "learning_rate": 1e-06, + "loss": 0.085, + "num_tokens": 1447458059.0, + "reward": 0.48112747073173523, + "reward_std": 0.07644528150558472, + "rewards/progression_diversity/mean": -0.028856143355369568, + "rewards/progression_diversity/std": 0.08837366849184036, + "rewards/symbolic_reward_accuracy/mean": 0.4140625, + "rewards/symbolic_reward_accuracy/std": 0.49304109811782837, + "rewards/symbolic_reward_partial_score/mean": 0.7818033695220947, + "rewards/symbolic_reward_partial_score/std": 0.23644909262657166, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0008317232131958, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 46.746429443359375, + "step": 2489 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3214826136827469, + "epoch": 3.9903846153846154, + "grad_norm": 32.26913833618164, + "learning_rate": 1e-06, + "loss": 0.0057, + "step": 2490 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3154735267162323, + "epoch": 3.9919871794871797, + "grad_norm": 9.52782154083252, + "learning_rate": 1e-06, + "loss": 0.5166, + "step": 2491 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.31883853673934937, + "epoch": 3.9935897435897436, + "grad_norm": 1062.50439453125, + "learning_rate": 1e-06, + "loss": 0.3673, + "step": 2492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3200.0, + "completions/mean_length": 3408.443359375, + "completions/mean_terminated_length": 1941.6412353515625, + "completions/min_length": 1132.0, + "completions/min_terminated_length": 1132.0, + "entropy": 0.3397838622331619, + "epoch": 3.9951923076923075, + "frac_reward_zero_std": 0.21875, + "grad_norm": 321.0043640136719, + "learning_rate": 1e-06, + "loss": 0.0395, + "num_tokens": 1449967230.0, + "reward": 0.4184061884880066, + "reward_std": 0.0656353235244751, + "rewards/progression_diversity/mean": -0.028522029519081116, + "rewards/progression_diversity/std": 0.08747442811727524, + "rewards/symbolic_reward_accuracy/mean": 0.3203125, + "rewards/symbolic_reward_accuracy/std": 0.4670529365539551, + "rewards/symbolic_reward_partial_score/mean": 0.7582682371139526, + "rewards/symbolic_reward_partial_score/std": 0.22663599252700806, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0017540454864502, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 46.01112747192383, + "step": 2493 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3056804835796356, + "epoch": 3.996794871794872, + "grad_norm": 35.7710075378418, + "learning_rate": 1e-06, + "loss": 0.2021, + "step": 2494 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.33890417218208313, + "epoch": 3.998397435897436, + "grad_norm": 0.29032865166664124, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 2495 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31833402812480927, + "epoch": 4.0, + "grad_norm": 0.019398123025894165, + "learning_rate": 1e-06, + "loss": 0.1324, + "step": 2496 + }, + { + "epoch": 4.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.0859375, + "eval_completions/max_length": 16384.0, + "eval_completions/max_terminated_length": 2986.0625, + "eval_completions/mean_length": 3210.553466796875, + "eval_completions/mean_terminated_length": 1972.495346069336, + "eval_completions/min_length": 1199.8125, + "eval_completions/min_terminated_length": 1199.8125, + "eval_entropy": 0.3062564991414547, + "eval_frac_reward_zero_std": 0.23828125, + "eval_loss": 0.03558493033051491, + "eval_num_tokens": 1449967230.0, + "eval_reward": 0.25646816566586494, + "eval_reward_std": 0.03971813467796892, + "eval_rewards/progression_diversity/mean": -0.022923253040062264, + "eval_rewards/progression_diversity/std": 0.07360347534995526, + "eval_rewards/symbolic_reward_accuracy/mean": 0.093994140625, + "eval_rewards/symbolic_reward_accuracy/std": 0.21256664022803307, + "eval_rewards/symbolic_reward_partial_score/mean": 0.6708434987813234, + "eval_rewards/symbolic_reward_partial_score/std": 0.1871126431506127, + "eval_rewards/tag_count_reward/mean": -0.009521484375, + "eval_rewards/tag_count_reward/std": 0.07448925846256316, + "eval_runtime": 4359.773, + "eval_samples_per_second": 0.057, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.03790351934731, + "eval_sampling/importance_sampling_ratio/min": 0.0, + "eval_sampling/sampling_logp_difference/max": 782.375, + "eval_sampling/sampling_logp_difference/mean": 27.384851962327957, + "eval_steps_per_second": 0.0, + "step": 2496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3075.0, + "completions/mean_length": 2979.962890625, + "completions/mean_terminated_length": 1966.2122802734375, + "completions/min_length": 1032.0, + "completions/min_terminated_length": 1032.0, + "entropy": 0.32689496874809265, + "epoch": 4.001602564102564, + "frac_reward_zero_std": 0.34375, + "grad_norm": 373.3982849121094, + "learning_rate": 1e-06, + "loss": 0.0951, + "num_tokens": 1452348555.0, + "reward": 0.3907577395439148, + "reward_std": 0.044927872717380524, + "rewards/progression_diversity/mean": -0.018952488899230957, + "rewards/progression_diversity/std": 0.0710325539112091, + "rewards/symbolic_reward_accuracy/mean": 0.271484375, + "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, + "rewards/symbolic_reward_partial_score/mean": 0.7653970718383789, + "rewards/symbolic_reward_partial_score/std": 0.23204264044761658, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0241422653198242, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 34.15648651123047, + "step": 2497 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.3391586244106293, + "epoch": 4.003205128205129, + "grad_norm": 0.013818570412695408, + "learning_rate": 1e-06, + "loss": 0.2369, + "step": 2498 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.32884329557418823, + "epoch": 4.0048076923076925, + "grad_norm": 7747.47119140625, + "learning_rate": 1e-06, + "loss": 1.4664, + "step": 2499 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3341674506664276, + "epoch": 4.006410256410256, + "grad_norm": 0.5125430822372437, + "learning_rate": 1e-06, + "loss": 0.2162, + "step": 2500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3270.0, + "completions/mean_length": 3343.3125, + "completions/mean_terminated_length": 1963.196533203125, + "completions/min_length": 1081.0, + "completions/min_terminated_length": 1081.0, + "entropy": 0.3030368685722351, + "epoch": 4.00801282051282, + "frac_reward_zero_std": 0.25, + "grad_norm": 691.9620971679688, + "learning_rate": 1e-06, + "loss": 0.1049, + "num_tokens": 1454962379.0, + "reward": 0.34921398758888245, + "reward_std": 0.05184464529156685, + "rewards/progression_diversity/mean": -0.02537970244884491, + "rewards/progression_diversity/std": 0.08060704171657562, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.6792155504226685, + "rewards/symbolic_reward_partial_score/std": 0.23507006466388702, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0067884922027588, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 43.07496643066406, + "step": 2501 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3297167718410492, + "epoch": 4.009615384615385, + "grad_norm": 0.07673323899507523, + "learning_rate": 1e-06, + "loss": 0.0893, + "step": 2502 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3251052349805832, + "epoch": 4.011217948717949, + "grad_norm": 0.011674604378640652, + "learning_rate": 1e-06, + "loss": 0.1128, + "step": 2503 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3353799283504486, + "epoch": 4.012820512820513, + "grad_norm": 0.029069218784570694, + "learning_rate": 1e-06, + "loss": 0.0488, + "step": 2504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3116.0, + "completions/mean_length": 3221.0078125, + "completions/mean_terminated_length": 1983.461669921875, + "completions/min_length": 1074.0, + "completions/min_terminated_length": 1074.0, + "entropy": 0.3239881694316864, + "epoch": 4.014423076923077, + "frac_reward_zero_std": 0.1875, + "grad_norm": 496.55010986328125, + "learning_rate": 1e-06, + "loss": 0.0835, + "num_tokens": 1457508255.0, + "reward": 0.38050615787506104, + "reward_std": 0.04678242653608322, + "rewards/progression_diversity/mean": -0.02165273018181324, + "rewards/progression_diversity/std": 0.07281038910150528, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.740429699420929, + "rewards/symbolic_reward_partial_score/std": 0.20741601288318634, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0144760608673096, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 38.09929656982422, + "step": 2505 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.29394128918647766, + "epoch": 4.016025641025641, + "grad_norm": 0.03365239128470421, + "learning_rate": 1e-06, + "loss": 0.1895, + "step": 2506 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3248167932033539, + "epoch": 4.017628205128205, + "grad_norm": 0.03242243453860283, + "learning_rate": 1e-06, + "loss": 0.0367, + "step": 2507 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3440161794424057, + "epoch": 4.019230769230769, + "grad_norm": 0.023058878257870674, + "learning_rate": 1e-06, + "loss": 0.0453, + "step": 2508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3390.0, + "completions/mean_length": 2804.220703125, + "completions/mean_terminated_length": 1988.8717041015625, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.3260979652404785, + "epoch": 4.020833333333333, + "frac_reward_zero_std": 0.4375, + "grad_norm": 859.6063842773438, + "learning_rate": 1e-06, + "loss": 0.0866, + "num_tokens": 1459937296.0, + "reward": 0.23110386729240417, + "reward_std": 0.02382086217403412, + "rewards/progression_diversity/mean": -0.015101805329322815, + "rewards/progression_diversity/std": 0.06296249479055405, + "rewards/symbolic_reward_accuracy/mean": 0.05859375, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.6582194566726685, + "rewards/symbolic_reward_partial_score/std": 0.18645817041397095, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.035254716873169, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 27.520709991455078, + "step": 2509 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.34093551337718964, + "epoch": 4.022435897435898, + "grad_norm": 2172.674560546875, + "learning_rate": 1e-06, + "loss": 0.2025, + "step": 2510 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3240719437599182, + "epoch": 4.024038461538462, + "grad_norm": 67.2519760131836, + "learning_rate": 1e-06, + "loss": 0.0682, + "step": 2511 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3448804020881653, + "epoch": 4.0256410256410255, + "grad_norm": 0.01443031057715416, + "learning_rate": 1e-06, + "loss": 0.0467, + "step": 2512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3084.0, + "completions/mean_length": 2421.150390625, + "completions/mean_terminated_length": 1941.6182861328125, + "completions/min_length": 1169.0, + "completions/min_terminated_length": 1169.0, + "entropy": 0.3457055538892746, + "epoch": 4.027243589743589, + "frac_reward_zero_std": 0.59375, + "grad_norm": 373.1669616699219, + "learning_rate": 1e-06, + "loss": 0.0384, + "num_tokens": 1462033373.0, + "reward": 0.36610591411590576, + "reward_std": 0.008678528480231762, + "rewards/progression_diversity/mean": -0.008554144762456417, + "rewards/progression_diversity/std": 0.04716269671916962, + "rewards/symbolic_reward_accuracy/mean": 0.25, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.7212890386581421, + "rewards/symbolic_reward_partial_score/std": 0.2121674120426178, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0559264421463013, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 17.93334197998047, + "step": 2513 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.35698336362838745, + "epoch": 4.028846153846154, + "grad_norm": 0.10000839084386826, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 2514 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3595133423805237, + "epoch": 4.030448717948718, + "grad_norm": 0.02287139743566513, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 2515 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.34900110960006714, + "epoch": 4.032051282051282, + "grad_norm": 0.011496487073600292, + "learning_rate": 1e-06, + "loss": 0.0666, + "step": 2516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3091.0, + "completions/mean_length": 2512.10546875, + "completions/mean_terminated_length": 1977.48876953125, + "completions/min_length": 1222.0, + "completions/min_terminated_length": 1222.0, + "entropy": 0.3486749231815338, + "epoch": 4.033653846153846, + "frac_reward_zero_std": 0.375, + "grad_norm": 842.0504150390625, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 1464206211.0, + "reward": 0.31601575016975403, + "reward_std": 0.02459620125591755, + "rewards/progression_diversity/mean": -0.00828959047794342, + "rewards/progression_diversity/std": 0.04331068694591522, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.7470214366912842, + "rewards/symbolic_reward_partial_score/std": 0.18681012094020844, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0497366189956665, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 21.2126522064209, + "step": 2517 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3538134843111038, + "epoch": 4.035256410256411, + "grad_norm": 0.03899611532688141, + "learning_rate": 1e-06, + "loss": 0.0486, + "step": 2518 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3479982316493988, + "epoch": 4.0368589743589745, + "grad_norm": 0.013328113593161106, + "learning_rate": 1e-06, + "loss": 0.0789, + "step": 2519 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.35109949111938477, + "epoch": 4.038461538461538, + "grad_norm": 0.015972906723618507, + "learning_rate": 1e-06, + "loss": 0.0823, + "step": 2520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3298.0, + "completions/mean_length": 2378.287109375, + "completions/mean_terminated_length": 1867.95751953125, + "completions/min_length": 1063.0, + "completions/min_terminated_length": 1063.0, + "entropy": 0.35145680606365204, + "epoch": 4.040064102564102, + "frac_reward_zero_std": 0.53125, + "grad_norm": 592.3827514648438, + "learning_rate": 1e-06, + "loss": 0.0507, + "num_tokens": 1466305798.0, + "reward": 0.3080458343029022, + "reward_std": 0.013700053095817566, + "rewards/progression_diversity/mean": -0.00889340415596962, + "rewards/progression_diversity/std": 0.04732125252485275, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.7198241949081421, + "rewards/symbolic_reward_partial_score/std": 0.19643016159534454, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0504951477050781, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 20.981121063232422, + "step": 2521 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3594615310430527, + "epoch": 4.041666666666667, + "grad_norm": 0.03896607458591461, + "learning_rate": 1e-06, + "loss": 0.017, + "step": 2522 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.34222954511642456, + "epoch": 4.043269230769231, + "grad_norm": 0.006857524160295725, + "learning_rate": 1e-06, + "loss": 0.0644, + "step": 2523 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.35084451735019684, + "epoch": 4.044871794871795, + "grad_norm": 0.009686310775578022, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 2524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2915.0, + "completions/mean_length": 2450.3046875, + "completions/mean_terminated_length": 1824.7100830078125, + "completions/min_length": 1064.0, + "completions/min_terminated_length": 1064.0, + "entropy": 0.3546312600374222, + "epoch": 4.046474358974359, + "frac_reward_zero_std": 0.53125, + "grad_norm": 310.8623962402344, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 1468479282.0, + "reward": 0.38561779260635376, + "reward_std": 0.018528560176491737, + "rewards/progression_diversity/mean": -0.010976451449096203, + "rewards/progression_diversity/std": 0.05261799693107605, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.7551594972610474, + "rewards/symbolic_reward_partial_score/std": 0.20599855482578278, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0471153259277344, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 22.519685745239258, + "step": 2525 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3387318253517151, + "epoch": 4.048076923076923, + "grad_norm": 0.017549673095345497, + "learning_rate": 1e-06, + "loss": 0.1121, + "step": 2526 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.3566637337207794, + "epoch": 4.049679487179487, + "grad_norm": 0.025295212864875793, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 2527 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.33428405225276947, + "epoch": 4.051282051282051, + "grad_norm": 0.019428903236985207, + "learning_rate": 1e-06, + "loss": 0.0258, + "step": 2528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2715.0, + "completions/mean_length": 3127.302734375, + "completions/mean_terminated_length": 1787.3785400390625, + "completions/min_length": 1011.0, + "completions/min_terminated_length": 1011.0, + "entropy": 0.3295024484395981, + "epoch": 4.052884615384615, + "frac_reward_zero_std": 0.34375, + "grad_norm": 1348.5477294921875, + "learning_rate": 1e-06, + "loss": 0.057, + "num_tokens": 1470886541.0, + "reward": 0.28540563583374023, + "reward_std": 0.019026556983590126, + "rewards/progression_diversity/mean": -0.02584357187151909, + "rewards/progression_diversity/std": 0.08347862958908081, + "rewards/symbolic_reward_accuracy/mean": 0.119140625, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.7152343988418579, + "rewards/symbolic_reward_partial_score/std": 0.19548086822032928, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.010728359222412, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 41.08893585205078, + "step": 2529 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30960477888584137, + "epoch": 4.05448717948718, + "grad_norm": 96.72750091552734, + "learning_rate": 1e-06, + "loss": 0.0919, + "step": 2530 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.32376880943775177, + "epoch": 4.056089743589744, + "grad_norm": 0.019057314842939377, + "learning_rate": 1e-06, + "loss": 0.0911, + "step": 2531 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.32927900552749634, + "epoch": 4.0576923076923075, + "grad_norm": 0.010551064275205135, + "learning_rate": 1e-06, + "loss": 0.1108, + "step": 2532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2729.0, + "completions/mean_length": 2688.408203125, + "completions/mean_terminated_length": 1775.368896484375, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "entropy": 0.34093132615089417, + "epoch": 4.059294871794871, + "frac_reward_zero_std": 0.375, + "grad_norm": 454.11712646484375, + "learning_rate": 1e-06, + "loss": 0.0975, + "num_tokens": 1473027214.0, + "reward": 0.42107725143432617, + "reward_std": 0.030925702303647995, + "rewards/progression_diversity/mean": -0.01678628847002983, + "rewards/progression_diversity/std": 0.06638278067111969, + "rewards/symbolic_reward_accuracy/mean": 0.3046875, + "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, + "rewards/symbolic_reward_partial_score/mean": 0.7967284917831421, + "rewards/symbolic_reward_partial_score/std": 0.18162044882774353, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0306940078735352, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 30.967906951904297, + "step": 2533 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.32887102663517, + "epoch": 4.060897435897436, + "grad_norm": 0.00896961148828268, + "learning_rate": 1e-06, + "loss": 0.1063, + "step": 2534 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3550725132226944, + "epoch": 4.0625, + "grad_norm": 0.014958408661186695, + "learning_rate": 1e-06, + "loss": 0.069, + "step": 2535 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.34709444642066956, + "epoch": 4.064102564102564, + "grad_norm": 0.013090104795992374, + "learning_rate": 1e-06, + "loss": 0.0225, + "step": 2536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2727.0, + "completions/mean_length": 3024.65625, + "completions/mean_terminated_length": 1737.353271484375, + "completions/min_length": 1045.0, + "completions/min_terminated_length": 1045.0, + "entropy": 0.3089066445827484, + "epoch": 4.065705128205129, + "frac_reward_zero_std": 0.4375, + "grad_norm": 508.8141174316406, + "learning_rate": 1e-06, + "loss": 0.0951, + "num_tokens": 1475532462.0, + "reward": 0.31843122839927673, + "reward_std": 0.011976012028753757, + "rewards/progression_diversity/mean": -0.025041041895747185, + "rewards/progression_diversity/std": 0.08240573108196259, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.6918293833732605, + "rewards/symbolic_reward_partial_score/std": 0.20478877425193787, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0157009363174438, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 37.54655075073242, + "step": 2537 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.31688109040260315, + "epoch": 4.0673076923076925, + "grad_norm": 1613.686279296875, + "learning_rate": 1e-06, + "loss": 0.1161, + "step": 2538 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30626846849918365, + "epoch": 4.068910256410256, + "grad_norm": 0.013836408033967018, + "learning_rate": 1e-06, + "loss": 0.0784, + "step": 2539 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.3128751218318939, + "epoch": 4.07051282051282, + "grad_norm": 0.025374021381139755, + "learning_rate": 1e-06, + "loss": 0.0991, + "step": 2540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2768.0, + "completions/mean_length": 2856.404296875, + "completions/mean_terminated_length": 1741.0211181640625, + "completions/min_length": 999.0, + "completions/min_terminated_length": 999.0, + "entropy": 0.3115910291671753, + "epoch": 4.072115384615385, + "frac_reward_zero_std": 0.34375, + "grad_norm": 627.9964599609375, + "learning_rate": 1e-06, + "loss": 0.1078, + "num_tokens": 1477984861.0, + "reward": 0.29800140857696533, + "reward_std": 0.011926448903977871, + "rewards/progression_diversity/mean": -0.02163849025964737, + "rewards/progression_diversity/std": 0.07751591503620148, + "rewards/symbolic_reward_accuracy/mean": 0.123046875, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.7486165761947632, + "rewards/symbolic_reward_partial_score/std": 0.1641637235879898, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0106605291366577, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 40.304405212402344, + "step": 2541 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3093924969434738, + "epoch": 4.073717948717949, + "grad_norm": 0.008785123936831951, + "learning_rate": 1e-06, + "loss": 0.0982, + "step": 2542 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.33030666410923004, + "epoch": 4.075320512820513, + "grad_norm": 0.017673548310995102, + "learning_rate": 1e-06, + "loss": 0.0605, + "step": 2543 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3170679658651352, + "epoch": 4.076923076923077, + "grad_norm": 0.013326681219041348, + "learning_rate": 1e-06, + "loss": 0.0929, + "step": 2544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2629.0, + "completions/mean_length": 2784.251953125, + "completions/mean_terminated_length": 1724.903076171875, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "entropy": 0.3214297592639923, + "epoch": 4.078525641025641, + "frac_reward_zero_std": 0.375, + "grad_norm": 304.0992431640625, + "learning_rate": 1e-06, + "loss": 0.076, + "num_tokens": 1480289838.0, + "reward": 0.3872213363647461, + "reward_std": 0.013941626995801926, + "rewards/progression_diversity/mean": -0.0229843370616436, + "rewards/progression_diversity/std": 0.08313606679439545, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.79931640625, + "rewards/symbolic_reward_partial_score/std": 0.18186363577842712, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0218315124511719, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 34.574668884277344, + "step": 2545 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3198620527982712, + "epoch": 4.080128205128205, + "grad_norm": 0.009517377242445946, + "learning_rate": 1e-06, + "loss": 0.0768, + "step": 2546 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.30512526631355286, + "epoch": 4.081730769230769, + "grad_norm": 0.018004372715950012, + "learning_rate": 1e-06, + "loss": 0.1049, + "step": 2547 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3438553065061569, + "epoch": 4.083333333333333, + "grad_norm": 0.025911420583724976, + "learning_rate": 1e-06, + "loss": 0.059, + "step": 2548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2753.0, + "completions/mean_length": 2879.19921875, + "completions/mean_terminated_length": 1765.695556640625, + "completions/min_length": 1122.0, + "completions/min_terminated_length": 1122.0, + "entropy": 0.3178988993167877, + "epoch": 4.084935897435898, + "frac_reward_zero_std": 0.25, + "grad_norm": 239.32733154296875, + "learning_rate": 1e-06, + "loss": 0.0345, + "num_tokens": 1482682692.0, + "reward": 0.35247373580932617, + "reward_std": 0.008235493674874306, + "rewards/progression_diversity/mean": -0.025090141221880913, + "rewards/progression_diversity/std": 0.08833190053701401, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.7382487058639526, + "rewards/symbolic_reward_partial_score/std": 0.19196312129497528, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0059094429016113, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 43.50497055053711, + "step": 2549 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.32996954023838043, + "epoch": 4.086538461538462, + "grad_norm": 0.013742972165346146, + "learning_rate": 1e-06, + "loss": 0.0517, + "step": 2550 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31920306384563446, + "epoch": 4.0881410256410255, + "grad_norm": 0.011623159982264042, + "learning_rate": 1e-06, + "loss": 0.1203, + "step": 2551 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30411607027053833, + "epoch": 4.089743589743589, + "grad_norm": 0.01574169658124447, + "learning_rate": 1e-06, + "loss": 0.1498, + "step": 2552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3040.0, + "completions/mean_length": 2363.705078125, + "completions/mean_terminated_length": 1764.0592041015625, + "completions/min_length": 1029.0, + "completions/min_terminated_length": 1029.0, + "entropy": 0.3521651029586792, + "epoch": 4.091346153846154, + "frac_reward_zero_std": 0.375, + "grad_norm": 662.1588745117188, + "learning_rate": 1e-06, + "loss": 0.0482, + "num_tokens": 1484783181.0, + "reward": 0.26874637603759766, + "reward_std": 0.013555062934756279, + "rewards/progression_diversity/mean": -0.014036715030670166, + "rewards/progression_diversity/std": 0.06895329058170319, + "rewards/symbolic_reward_accuracy/mean": 0.09375, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.7100911140441895, + "rewards/symbolic_reward_partial_score/std": 0.17183727025985718, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0332553386688232, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 29.85314178466797, + "step": 2553 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.34595395624637604, + "epoch": 4.092948717948718, + "grad_norm": 4.502384185791016, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 2554 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3410833477973938, + "epoch": 4.094551282051282, + "grad_norm": 0.011607046239078045, + "learning_rate": 1e-06, + "loss": 0.0667, + "step": 2555 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.33509866893291473, + "epoch": 4.096153846153846, + "grad_norm": 0.014269710518419743, + "learning_rate": 1e-06, + "loss": 0.0963, + "step": 2556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3428.0, + "completions/mean_length": 2880.39453125, + "completions/mean_terminated_length": 1859.113525390625, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "entropy": 0.32943619787693024, + "epoch": 4.097756410256411, + "frac_reward_zero_std": 0.375, + "grad_norm": 1257.3856201171875, + "learning_rate": 1e-06, + "loss": 0.0543, + "num_tokens": 1487087367.0, + "reward": 0.3605473041534424, + "reward_std": 0.030462127178907394, + "rewards/progression_diversity/mean": -0.02095445990562439, + "rewards/progression_diversity/std": 0.07725287228822708, + "rewards/symbolic_reward_accuracy/mean": 0.23046875, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.7422363758087158, + "rewards/symbolic_reward_partial_score/std": 0.19137205183506012, + "rewards/tag_count_reward/mean": -0.001953125, + "rewards/tag_count_reward/std": 0.04419417306780815, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0257246494293213, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 34.513587951660156, + "step": 2557 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3395875543355942, + "epoch": 4.0993589743589745, + "grad_norm": 1.989341139793396, + "learning_rate": 1e-06, + "loss": 0.0551, + "step": 2558 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.33270154893398285, + "epoch": 4.100961538461538, + "grad_norm": 0.0756467655301094, + "learning_rate": 1e-06, + "loss": 0.1137, + "step": 2559 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3363744169473648, + "epoch": 4.102564102564102, + "grad_norm": 0.012534077279269695, + "learning_rate": 1e-06, + "loss": 0.0671, + "step": 2560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3092.0, + "completions/mean_length": 2916.15234375, + "completions/mean_terminated_length": 1927.9454345703125, + "completions/min_length": 1158.0, + "completions/min_terminated_length": 1158.0, + "entropy": 0.33795928955078125, + "epoch": 4.104166666666667, + "frac_reward_zero_std": 0.40625, + "grad_norm": 486.66265869140625, + "learning_rate": 1e-06, + "loss": 0.0406, + "num_tokens": 1489486389.0, + "reward": 0.3729179799556732, + "reward_std": 0.019203029572963715, + "rewards/progression_diversity/mean": -0.019237220287322998, + "rewards/progression_diversity/std": 0.07287276536226273, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.7495605945587158, + "rewards/symbolic_reward_partial_score/std": 0.2092960774898529, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0225183963775635, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 35.221961975097656, + "step": 2561 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.33110561966896057, + "epoch": 4.105769230769231, + "grad_norm": 13.403287887573242, + "learning_rate": 1e-06, + "loss": 0.0302, + "step": 2562 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3245306611061096, + "epoch": 4.107371794871795, + "grad_norm": 0.013325260020792484, + "learning_rate": 1e-06, + "loss": 0.1297, + "step": 2563 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3338320553302765, + "epoch": 4.108974358974359, + "grad_norm": 1.518842101097107, + "learning_rate": 1e-06, + "loss": 0.0908, + "step": 2564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3086.0, + "completions/mean_length": 3535.826171875, + "completions/mean_terminated_length": 1989.547119140625, + "completions/min_length": 1126.0, + "completions/min_terminated_length": 1126.0, + "entropy": 0.31208691000938416, + "epoch": 4.110576923076923, + "frac_reward_zero_std": 0.1875, + "grad_norm": 2256.84619140625, + "learning_rate": 1e-06, + "loss": 0.1228, + "num_tokens": 1492263788.0, + "reward": 0.3425925374031067, + "reward_std": 0.025561640039086342, + "rewards/progression_diversity/mean": -0.030298512428998947, + "rewards/progression_diversity/std": 0.08967959880828857, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.7237141728401184, + "rewards/symbolic_reward_partial_score/std": 0.20818644762039185, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913551807403564, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 50.347373962402344, + "step": 2565 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.3200324475765228, + "epoch": 4.112179487179487, + "grad_norm": 1.371201515197754, + "learning_rate": 1e-06, + "loss": 0.1, + "step": 2566 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.30504705011844635, + "epoch": 4.113782051282051, + "grad_norm": 2.302720308303833, + "learning_rate": 1e-06, + "loss": 0.1468, + "step": 2567 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3112814277410507, + "epoch": 4.115384615384615, + "grad_norm": 7566.38720703125, + "learning_rate": 1e-06, + "loss": 0.2698, + "step": 2568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3291.0, + "completions/mean_length": 3172.8828125, + "completions/mean_terminated_length": 2022.87060546875, + "completions/min_length": 1223.0, + "completions/min_terminated_length": 1223.0, + "entropy": 0.35531145334243774, + "epoch": 4.11698717948718, + "frac_reward_zero_std": 0.15625, + "grad_norm": 797.2550048828125, + "learning_rate": 1e-06, + "loss": 0.0721, + "num_tokens": 1494671600.0, + "reward": 0.32184159755706787, + "reward_std": 0.0444670133292675, + "rewards/progression_diversity/mean": -0.02189837582409382, + "rewards/progression_diversity/std": 0.07626625150442123, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.7167643308639526, + "rewards/symbolic_reward_partial_score/std": 0.23638619482517242, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.007836103439331, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 40.60625457763672, + "step": 2569 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3130183517932892, + "epoch": 4.118589743589744, + "grad_norm": 21.2623291015625, + "learning_rate": 1e-06, + "loss": 0.0972, + "step": 2570 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3290316015481949, + "epoch": 4.1201923076923075, + "grad_norm": 1913.60205078125, + "learning_rate": 1e-06, + "loss": 0.2226, + "step": 2571 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3343227505683899, + "epoch": 4.121794871794871, + "grad_norm": 0.022048160433769226, + "learning_rate": 1e-06, + "loss": 0.1135, + "step": 2572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3234.0, + "completions/mean_length": 3387.615234375, + "completions/mean_terminated_length": 2043.16162109375, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "entropy": 0.3383503258228302, + "epoch": 4.123397435897436, + "frac_reward_zero_std": 0.25, + "grad_norm": 572.839111328125, + "learning_rate": 1e-06, + "loss": 0.1123, + "num_tokens": 1497270795.0, + "reward": 0.41935285925865173, + "reward_std": 0.022548135370016098, + "rewards/progression_diversity/mean": -0.02565228007733822, + "rewards/progression_diversity/std": 0.08250828087329865, + "rewards/symbolic_reward_accuracy/mean": 0.310546875, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.7795573472976685, + "rewards/symbolic_reward_partial_score/std": 0.20047199726104736, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0107229948043823, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 37.779056549072266, + "step": 2573 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3201807737350464, + "epoch": 4.125, + "grad_norm": 0.01932665705680847, + "learning_rate": 1e-06, + "loss": 0.1213, + "step": 2574 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.31810304522514343, + "epoch": 4.126602564102564, + "grad_norm": 0.022387662902474403, + "learning_rate": 1e-06, + "loss": 0.1087, + "step": 2575 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3269665241241455, + "epoch": 4.128205128205128, + "grad_norm": 0.013768312521278858, + "learning_rate": 1e-06, + "loss": 0.1118, + "step": 2576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3283.0, + "completions/mean_length": 2913.330078125, + "completions/mean_terminated_length": 2015.2855224609375, + "completions/min_length": 1204.0, + "completions/min_terminated_length": 1204.0, + "entropy": 0.3411347270011902, + "epoch": 4.1298076923076925, + "frac_reward_zero_std": 0.25, + "grad_norm": 542.296875, + "learning_rate": 1e-06, + "loss": 0.0579, + "num_tokens": 1499596596.0, + "reward": 0.2602491080760956, + "reward_std": 0.025792162865400314, + "rewards/progression_diversity/mean": -0.015127346850931644, + "rewards/progression_diversity/std": 0.06053609028458595, + "rewards/symbolic_reward_accuracy/mean": 0.095703125, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.6778970956802368, + "rewards/symbolic_reward_partial_score/std": 0.19333595037460327, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0344008207321167, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 27.88906478881836, + "step": 2577 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3439747095108032, + "epoch": 4.131410256410256, + "grad_norm": 0.02023777738213539, + "learning_rate": 1e-06, + "loss": 0.0927, + "step": 2578 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.34476563334465027, + "epoch": 4.13301282051282, + "grad_norm": 7.5225725173950195, + "learning_rate": 1e-06, + "loss": 0.0408, + "step": 2579 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3402794599533081, + "epoch": 4.134615384615385, + "grad_norm": 0.021249419078230858, + "learning_rate": 1e-06, + "loss": 0.0551, + "step": 2580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3805.0, + "completions/mean_length": 2671.884765625, + "completions/mean_terminated_length": 1967.9775390625, + "completions/min_length": 597.0, + "completions/min_terminated_length": 597.0, + "entropy": 0.33940988779067993, + "epoch": 4.136217948717949, + "frac_reward_zero_std": 0.21875, + "grad_norm": 498.6994323730469, + "learning_rate": 1e-06, + "loss": 0.0785, + "num_tokens": 1501793977.0, + "reward": 0.3605765700340271, + "reward_std": 0.06087111681699753, + "rewards/progression_diversity/mean": -0.01119298581033945, + "rewards/progression_diversity/std": 0.05226728320121765, + "rewards/symbolic_reward_accuracy/mean": 0.2265625, + "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, + "rewards/symbolic_reward_partial_score/mean": 0.7517740726470947, + "rewards/symbolic_reward_partial_score/std": 0.18635277450084686, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0438597202301025, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 23.59453582763672, + "step": 2581 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3429712653160095, + "epoch": 4.137820512820513, + "grad_norm": 472.5995788574219, + "learning_rate": 1e-06, + "loss": 0.0979, + "step": 2582 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3512907773256302, + "epoch": 4.139423076923077, + "grad_norm": 0.022063203155994415, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 2583 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.36512602865695953, + "epoch": 4.141025641025641, + "grad_norm": 0.027568161487579346, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 2584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3768.0, + "completions/mean_length": 2782.953125, + "completions/mean_terminated_length": 1966.3271484375, + "completions/min_length": 1182.0, + "completions/min_terminated_length": 1182.0, + "entropy": 0.33962003886699677, + "epoch": 4.142628205128205, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1543.4376220703125, + "learning_rate": 1e-06, + "loss": 0.0607, + "num_tokens": 1504024065.0, + "reward": 0.3242529630661011, + "reward_std": 0.023926857858896255, + "rewards/progression_diversity/mean": -0.014647168107330799, + "rewards/progression_diversity/std": 0.0613836906850338, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.716748058795929, + "rewards/symbolic_reward_partial_score/std": 0.1955670863389969, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.031816005706787, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 30.275238037109375, + "step": 2585 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.33402132987976074, + "epoch": 4.144230769230769, + "grad_norm": 144.89903259277344, + "learning_rate": 1e-06, + "loss": 1.3615, + "step": 2586 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3640512526035309, + "epoch": 4.145833333333333, + "grad_norm": 0.025912059471011162, + "learning_rate": 1e-06, + "loss": 0.0157, + "step": 2587 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.346598282456398, + "epoch": 4.147435897435898, + "grad_norm": 0.017162565141916275, + "learning_rate": 1e-06, + "loss": 0.065, + "step": 2588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3701.0, + "completions/mean_length": 3073.26953125, + "completions/mean_terminated_length": 2006.1644287109375, + "completions/min_length": 1134.0, + "completions/min_terminated_length": 1134.0, + "entropy": 0.32446981966495514, + "epoch": 4.149038461538462, + "frac_reward_zero_std": 0.28125, + "grad_norm": 570.8603515625, + "learning_rate": 1e-06, + "loss": 0.1123, + "num_tokens": 1506499115.0, + "reward": 0.3911179006099701, + "reward_std": 0.021029271185398102, + "rewards/progression_diversity/mean": -0.018583200871944427, + "rewards/progression_diversity/std": 0.06753162294626236, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.7516113519668579, + "rewards/symbolic_reward_partial_score/std": 0.21333743631839752, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0214645862579346, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 35.46625900268555, + "step": 2589 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.35942013561725616, + "epoch": 4.1506410256410255, + "grad_norm": 4977.77294921875, + "learning_rate": 1e-06, + "loss": 0.2231, + "step": 2590 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3404110074043274, + "epoch": 4.152243589743589, + "grad_norm": 97.81608581542969, + "learning_rate": 1e-06, + "loss": 0.1382, + "step": 2591 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3034721314907074, + "epoch": 4.153846153846154, + "grad_norm": 10592.0458984375, + "learning_rate": 1e-06, + "loss": 0.6398, + "step": 2592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3079.0, + "completions/mean_length": 3393.611328125, + "completions/mean_terminated_length": 1925.132568359375, + "completions/min_length": 1143.0, + "completions/min_terminated_length": 1143.0, + "entropy": 0.314235121011734, + "epoch": 4.155448717948718, + "frac_reward_zero_std": 0.25, + "grad_norm": 377.22088623046875, + "learning_rate": 1e-06, + "loss": 0.0613, + "num_tokens": 1509122212.0, + "reward": 0.3483143150806427, + "reward_std": 0.027692176401615143, + "rewards/progression_diversity/mean": -0.024526942521333694, + "rewards/progression_diversity/std": 0.07418946921825409, + "rewards/symbolic_reward_accuracy/mean": 0.216796875, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.7308756113052368, + "rewards/symbolic_reward_partial_score/std": 0.21280378103256226, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.010713815689087, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 40.72765350341797, + "step": 2593 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3067990243434906, + "epoch": 4.157051282051282, + "grad_norm": 645088.0625, + "learning_rate": 1e-06, + "loss": 45.8723, + "step": 2594 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3317181318998337, + "epoch": 4.158653846153846, + "grad_norm": 10997.2705078125, + "learning_rate": 1e-06, + "loss": 1.253, + "step": 2595 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3151821494102478, + "epoch": 4.160256410256411, + "grad_norm": 50563.86328125, + "learning_rate": 1e-06, + "loss": 9.5746, + "step": 2596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.19140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3123.0, + "completions/mean_length": 4673.439453125, + "completions/mean_terminated_length": 1901.3743896484375, + "completions/min_length": 1096.0, + "completions/min_terminated_length": 1096.0, + "entropy": 0.2506090775132179, + "epoch": 4.1618589743589745, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1328.9156494140625, + "learning_rate": 1e-06, + "loss": 0.1149, + "num_tokens": 1512400453.0, + "reward": 0.44010183215141296, + "reward_std": 0.06620557606220245, + "rewards/progression_diversity/mean": -0.042061757296323776, + "rewards/progression_diversity/std": 0.08827082067728043, + "rewards/symbolic_reward_accuracy/mean": 0.361328125, + "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, + "rewards/symbolic_reward_partial_score/mean": 0.7477050423622131, + "rewards/symbolic_reward_partial_score/std": 0.23697598278522491, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9721221923828125, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 55.41648864746094, + "step": 2597 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.24983707815408707, + "epoch": 4.163461538461538, + "grad_norm": 100176.0, + "learning_rate": 1e-06, + "loss": 29.2153, + "step": 2598 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2740893214941025, + "epoch": 4.165064102564102, + "grad_norm": 18222.609375, + "learning_rate": 1e-06, + "loss": 1.2944, + "step": 2599 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3012789934873581, + "epoch": 4.166666666666667, + "grad_norm": 35.68260192871094, + "learning_rate": 1e-06, + "loss": 0.0814, + "step": 2600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.208984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2746.0, + "completions/mean_length": 4911.619140625, + "completions/mean_terminated_length": 1880.64453125, + "completions/min_length": 1124.0, + "completions/min_terminated_length": 1124.0, + "entropy": 0.24826913326978683, + "epoch": 4.168269230769231, + "frac_reward_zero_std": 0.03125, + "grad_norm": 513.0524291992188, + "learning_rate": 1e-06, + "loss": 0.1546, + "num_tokens": 1515790834.0, + "reward": 0.3871710002422333, + "reward_std": 0.04217392951250076, + "rewards/progression_diversity/mean": -0.0426679290831089, + "rewards/progression_diversity/std": 0.08528563380241394, + "rewards/symbolic_reward_accuracy/mean": 0.27734375, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.7399088740348816, + "rewards/symbolic_reward_partial_score/std": 0.21892520785331726, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9704339504241943, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 54.93540573120117, + "step": 2601 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2412198781967163, + "epoch": 4.169871794871795, + "grad_norm": 26214.958984375, + "learning_rate": 1e-06, + "loss": 2.6327, + "step": 2602 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.29308077692985535, + "epoch": 4.171474358974359, + "grad_norm": 19403.22265625, + "learning_rate": 1e-06, + "loss": 0.3375, + "step": 2603 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2676732540130615, + "epoch": 4.173076923076923, + "grad_norm": 0.013621392659842968, + "learning_rate": 1e-06, + "loss": 0.1484, + "step": 2604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.20703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3097.0, + "completions/mean_length": 4899.4921875, + "completions/mean_terminated_length": 1901.0738525390625, + "completions/min_length": 1051.0, + "completions/min_terminated_length": 1051.0, + "entropy": 0.2361399084329605, + "epoch": 4.174679487179487, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1089.3848876953125, + "learning_rate": 1e-06, + "loss": 0.1781, + "num_tokens": 1519098398.0, + "reward": 0.32966169714927673, + "reward_std": 0.037758007645606995, + "rewards/progression_diversity/mean": -0.039204493165016174, + "rewards/progression_diversity/std": 0.07915779203176498, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.731689453125, + "rewards/symbolic_reward_partial_score/std": 0.20694516599178314, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9641684889793396, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 58.79726028442383, + "step": 2605 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.29851216077804565, + "epoch": 4.176282051282051, + "grad_norm": 0.012859205715358257, + "learning_rate": 1e-06, + "loss": 0.0952, + "step": 2606 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.26364219933748245, + "epoch": 4.177884615384615, + "grad_norm": 0.01124588306993246, + "learning_rate": 1e-06, + "loss": 0.1605, + "step": 2607 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.25536324083805084, + "epoch": 4.17948717948718, + "grad_norm": 0.014565329998731613, + "learning_rate": 1e-06, + "loss": 0.1756, + "step": 2608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.263671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3179.0, + "completions/mean_length": 5711.21484375, + "completions/mean_terminated_length": 1889.3951416015625, + "completions/min_length": 939.0, + "completions/min_terminated_length": 939.0, + "entropy": 0.2375631034374237, + "epoch": 4.181089743589744, + "frac_reward_zero_std": 0.0, + "grad_norm": 1342.1802978515625, + "learning_rate": 1e-06, + "loss": 0.1399, + "num_tokens": 1522799932.0, + "reward": 0.375043660402298, + "reward_std": 0.06779703497886658, + "rewards/progression_diversity/mean": -0.05374106019735336, + "rewards/progression_diversity/std": 0.09237094968557358, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.7239420413970947, + "rewards/symbolic_reward_partial_score/std": 0.2347642481327057, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.932857871055603, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 76.66647338867188, + "step": 2609 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2648901790380478, + "epoch": 4.1826923076923075, + "grad_norm": 23.981592178344727, + "learning_rate": 1e-06, + "loss": 0.1384, + "step": 2610 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2275170087814331, + "epoch": 4.184294871794872, + "grad_norm": 0.01362849585711956, + "learning_rate": 1e-06, + "loss": 0.2444, + "step": 2611 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.21743150800466537, + "epoch": 4.185897435897436, + "grad_norm": 0.012125054374337196, + "learning_rate": 1e-06, + "loss": 0.2923, + "step": 2612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3040.0, + "completions/mean_length": 5987.09375, + "completions/mean_terminated_length": 1918.7391357421875, + "completions/min_length": 973.0, + "completions/min_terminated_length": 973.0, + "entropy": 0.24884148687124252, + "epoch": 4.1875, + "frac_reward_zero_std": 0.0625, + "grad_norm": 962.3455810546875, + "learning_rate": 1e-06, + "loss": 0.1161, + "num_tokens": 1526650092.0, + "reward": 0.39309823513031006, + "reward_std": 0.0952787920832634, + "rewards/progression_diversity/mean": -0.05785566568374634, + "rewards/progression_diversity/std": 0.09512782096862793, + "rewards/symbolic_reward_accuracy/mean": 0.279296875, + "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, + "rewards/symbolic_reward_partial_score/mean": 0.759521484375, + "rewards/symbolic_reward_partial_score/std": 0.2274795025587082, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.934838056564331, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 76.90158081054688, + "step": 2613 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.19819628447294235, + "epoch": 4.189102564102564, + "grad_norm": 79746.578125, + "learning_rate": 1e-06, + "loss": 0.2315, + "step": 2614 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.23601322621107101, + "epoch": 4.190705128205128, + "grad_norm": 0.25907084345817566, + "learning_rate": 1e-06, + "loss": 0.1821, + "step": 2615 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.23588327318429947, + "epoch": 4.1923076923076925, + "grad_norm": 0.07316934317350388, + "learning_rate": 1e-06, + "loss": 0.1802, + "step": 2616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.279296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2953.0, + "completions/mean_length": 5958.806640625, + "completions/mean_terminated_length": 1918.6910400390625, + "completions/min_length": 1163.0, + "completions/min_terminated_length": 1163.0, + "entropy": 0.21116046607494354, + "epoch": 4.193910256410256, + "frac_reward_zero_std": 0.0, + "grad_norm": 1958.6832275390625, + "learning_rate": 1e-06, + "loss": 0.1101, + "num_tokens": 1530549113.0, + "reward": 0.3267575204372406, + "reward_std": 0.08475440740585327, + "rewards/progression_diversity/mean": -0.06009019538760185, + "rewards/progression_diversity/std": 0.09868360310792923, + "rewards/symbolic_reward_accuracy/mean": 0.193359375, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.7077311277389526, + "rewards/symbolic_reward_partial_score/std": 0.24380625784397125, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9315183758735657, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 78.36473083496094, + "step": 2617 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.22109153121709824, + "epoch": 4.19551282051282, + "grad_norm": 3.5450334548950195, + "learning_rate": 1e-06, + "loss": 0.2224, + "step": 2618 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.24128204584121704, + "epoch": 4.197115384615385, + "grad_norm": 0.01579132489860058, + "learning_rate": 1e-06, + "loss": 0.1629, + "step": 2619 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.22772854566574097, + "epoch": 4.198717948717949, + "grad_norm": 0.02087291143834591, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 2620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.279296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4516.0, + "completions/mean_length": 5997.45703125, + "completions/mean_terminated_length": 1972.31982421875, + "completions/min_length": 1246.0, + "completions/min_terminated_length": 1246.0, + "entropy": 0.2100074663758278, + "epoch": 4.200320512820513, + "frac_reward_zero_std": 0.0, + "grad_norm": 1239.959228515625, + "learning_rate": 1e-06, + "loss": 0.1074, + "num_tokens": 1534550627.0, + "reward": 0.26383617520332336, + "reward_std": 0.05979030951857567, + "rewards/progression_diversity/mean": -0.06218412518501282, + "rewards/progression_diversity/std": 0.1025165319442749, + "rewards/symbolic_reward_accuracy/mean": 0.103515625, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.6784017086029053, + "rewards/symbolic_reward_partial_score/std": 0.21352458000183105, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9165331125259399, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 87.43069458007812, + "step": 2621 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.24080143868923187, + "epoch": 4.201923076923077, + "grad_norm": 2.7573001384735107, + "learning_rate": 1e-06, + "loss": 0.1249, + "step": 2622 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.21514707058668137, + "epoch": 4.203525641025641, + "grad_norm": 0.024495212361216545, + "learning_rate": 1e-06, + "loss": 0.1772, + "step": 2623 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.20372025668621063, + "epoch": 4.205128205128205, + "grad_norm": 0.013811548240482807, + "learning_rate": 1e-06, + "loss": 0.2065, + "step": 2624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.28515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3565.0, + "completions/mean_length": 6081.05078125, + "completions/mean_terminated_length": 1971.131103515625, + "completions/min_length": 945.0, + "completions/min_terminated_length": 945.0, + "entropy": 0.21161595731973648, + "epoch": 4.206730769230769, + "frac_reward_zero_std": 0.0, + "grad_norm": 856.1641845703125, + "learning_rate": 1e-06, + "loss": 0.154, + "num_tokens": 1538566221.0, + "reward": 0.3104988634586334, + "reward_std": 0.025025444105267525, + "rewards/progression_diversity/mean": -0.06339757144451141, + "rewards/progression_diversity/std": 0.10305111110210419, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.7278645634651184, + "rewards/symbolic_reward_partial_score/std": 0.1853560358285904, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9161036014556885, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 87.99066925048828, + "step": 2625 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.23871337622404099, + "epoch": 4.208333333333333, + "grad_norm": 36.576744079589844, + "learning_rate": 1e-06, + "loss": 0.1355, + "step": 2626 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.21872205287218094, + "epoch": 4.209935897435898, + "grad_norm": 27.23736000061035, + "learning_rate": 1e-06, + "loss": 0.1091, + "step": 2627 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.1995127573609352, + "epoch": 4.211538461538462, + "grad_norm": 0.022180048748850822, + "learning_rate": 1e-06, + "loss": 0.2113, + "step": 2628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.310546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4055.0, + "completions/mean_length": 6442.0234375, + "completions/mean_terminated_length": 1963.9093017578125, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "entropy": 0.20760582387447357, + "epoch": 4.2131410256410255, + "frac_reward_zero_std": 0.0, + "grad_norm": 3401.139404296875, + "learning_rate": 1e-06, + "loss": 0.1055, + "num_tokens": 1542709657.0, + "reward": 0.2375459372997284, + "reward_std": 0.05149232968688011, + "rewards/progression_diversity/mean": -0.07206659764051437, + "rewards/progression_diversity/std": 0.11053474247455597, + "rewards/symbolic_reward_accuracy/mean": 0.052734375, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.6946126222610474, + "rewards/symbolic_reward_partial_score/std": 0.21639835834503174, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9067912697792053, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 92.28494262695312, + "step": 2629 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.1819758117198944, + "epoch": 4.214743589743589, + "grad_norm": 1448.0584716796875, + "learning_rate": 1e-06, + "loss": 0.3297, + "step": 2630 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.22094269841909409, + "epoch": 4.216346153846154, + "grad_norm": 0.1524176448583603, + "learning_rate": 1e-06, + "loss": 0.1131, + "step": 2631 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.21019763499498367, + "epoch": 4.217948717948718, + "grad_norm": 0.015752186998724937, + "learning_rate": 1e-06, + "loss": 0.1985, + "step": 2632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.330078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3971.0, + "completions/mean_length": 6724.53125, + "completions/mean_terminated_length": 1965.2012939453125, + "completions/min_length": 1115.0, + "completions/min_terminated_length": 1115.0, + "entropy": 0.19603969156742096, + "epoch": 4.219551282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1366.21484375, + "learning_rate": 1e-06, + "loss": 0.1466, + "num_tokens": 1547020169.0, + "reward": 0.35085177421569824, + "reward_std": 0.07054072618484497, + "rewards/progression_diversity/mean": -0.07302698493003845, + "rewards/progression_diversity/std": 0.10713056474924088, + "rewards/symbolic_reward_accuracy/mean": 0.232421875, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.7110025882720947, + "rewards/symbolic_reward_partial_score/std": 0.22401832044124603, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.910980224609375, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 90.13082122802734, + "step": 2633 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.19943545758724213, + "epoch": 4.221153846153846, + "grad_norm": 25.857837677001953, + "learning_rate": 1e-06, + "loss": 0.1302, + "step": 2634 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.21115753799676895, + "epoch": 4.222756410256411, + "grad_norm": 0.014513122849166393, + "learning_rate": 1e-06, + "loss": 0.1383, + "step": 2635 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.1924782246351242, + "epoch": 4.2243589743589745, + "grad_norm": 0.024281147867441177, + "learning_rate": 1e-06, + "loss": 0.1726, + "step": 2636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.255859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3878.0, + "completions/mean_length": 5678.255859375, + "completions/mean_terminated_length": 1997.2781982421875, + "completions/min_length": 1170.0, + "completions/min_terminated_length": 1170.0, + "entropy": 0.21415793150663376, + "epoch": 4.225961538461538, + "frac_reward_zero_std": 0.0, + "grad_norm": 1601.75390625, + "learning_rate": 1e-06, + "loss": 0.0806, + "num_tokens": 1550778716.0, + "reward": 0.34982889890670776, + "reward_std": 0.06202153116464615, + "rewards/progression_diversity/mean": -0.05031656473875046, + "rewards/progression_diversity/std": 0.0884041115641594, + "rewards/symbolic_reward_accuracy/mean": 0.232421875, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.7055338621139526, + "rewards/symbolic_reward_partial_score/std": 0.22613964974880219, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9257657527923584, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 81.70069885253906, + "step": 2637 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.22891773283481598, + "epoch": 4.227564102564102, + "grad_norm": 2865.813232421875, + "learning_rate": 1e-06, + "loss": 0.0946, + "step": 2638 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.21290214359760284, + "epoch": 4.229166666666667, + "grad_norm": 9.31295394897461, + "learning_rate": 1e-06, + "loss": 0.194, + "step": 2639 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2276032641530037, + "epoch": 4.230769230769231, + "grad_norm": 863.3584594726562, + "learning_rate": 1e-06, + "loss": 0.2034, + "step": 2640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4074.0, + "completions/mean_length": 6243.08203125, + "completions/mean_terminated_length": 1961.3612060546875, + "completions/min_length": 1208.0, + "completions/min_terminated_length": 1208.0, + "entropy": 0.18027716875076294, + "epoch": 4.232371794871795, + "frac_reward_zero_std": 0.0, + "grad_norm": 1586.920654296875, + "learning_rate": 1e-06, + "loss": 0.1933, + "num_tokens": 1554829782.0, + "reward": 0.5201883912086487, + "reward_std": 0.06066072732210159, + "rewards/progression_diversity/mean": -0.05831076577305794, + "rewards/progression_diversity/std": 0.09286186844110489, + "rewards/symbolic_reward_accuracy/mean": 0.453125, + "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, + "rewards/symbolic_reward_partial_score/mean": 0.8316080570220947, + "rewards/symbolic_reward_partial_score/std": 0.2016787976026535, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9149261116981506, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 86.6026611328125, + "step": 2641 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2154352068901062, + "epoch": 4.233974358974359, + "grad_norm": 0.13046927750110626, + "learning_rate": 1e-06, + "loss": 0.114, + "step": 2642 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.20094606280326843, + "epoch": 4.235576923076923, + "grad_norm": 0.010223207995295525, + "learning_rate": 1e-06, + "loss": 0.1366, + "step": 2643 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.20492058992385864, + "epoch": 4.237179487179487, + "grad_norm": 0.01739051379263401, + "learning_rate": 1e-06, + "loss": 0.156, + "step": 2644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3361.0, + "completions/mean_length": 5930.7109375, + "completions/mean_terminated_length": 1996.6773681640625, + "completions/min_length": 755.0, + "completions/min_terminated_length": 755.0, + "entropy": 0.21672135591506958, + "epoch": 4.238782051282051, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1050.203125, + "learning_rate": 1e-06, + "loss": 0.1747, + "num_tokens": 1558679986.0, + "reward": 0.4322790503501892, + "reward_std": 0.07013079524040222, + "rewards/progression_diversity/mean": -0.05236773565411568, + "rewards/progression_diversity/std": 0.08915333449840546, + "rewards/symbolic_reward_accuracy/mean": 0.35546875, + "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, + "rewards/symbolic_reward_partial_score/mean": 0.7395508289337158, + "rewards/symbolic_reward_partial_score/std": 0.2565183639526367, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9261719584465027, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 78.85722351074219, + "step": 2645 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.2054838389158249, + "epoch": 4.240384615384615, + "grad_norm": 109.77615356445312, + "learning_rate": 1e-06, + "loss": 0.1925, + "step": 2646 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.24229851365089417, + "epoch": 4.24198717948718, + "grad_norm": 0.40434131026268005, + "learning_rate": 1e-06, + "loss": 0.1406, + "step": 2647 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.21324530988931656, + "epoch": 4.243589743589744, + "grad_norm": 1434.3251953125, + "learning_rate": 1e-06, + "loss": 0.2993, + "step": 2648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.287109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3740.0, + "completions/mean_length": 6136.265625, + "completions/mean_terminated_length": 2009.095947265625, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "entropy": 0.20269504189491272, + "epoch": 4.2451923076923075, + "frac_reward_zero_std": 0.0, + "grad_norm": 778.6556396484375, + "learning_rate": 1e-06, + "loss": 0.1087, + "num_tokens": 1562723466.0, + "reward": 0.3345257341861725, + "reward_std": 0.0690338984131813, + "rewards/progression_diversity/mean": -0.054263561964035034, + "rewards/progression_diversity/std": 0.08993817865848541, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.6839518547058105, + "rewards/symbolic_reward_partial_score/std": 0.21551425755023956, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9316227436065674, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 74.91038513183594, + "step": 2649 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.197452612221241, + "epoch": 4.246794871794872, + "grad_norm": 7070.578125, + "learning_rate": 1e-06, + "loss": 0.9423, + "step": 2650 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.21947723627090454, + "epoch": 4.248397435897436, + "grad_norm": 1.046020746231079, + "learning_rate": 1e-06, + "loss": 0.2003, + "step": 2651 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.21606357395648956, + "epoch": 4.25, + "grad_norm": 0.011754968203604221, + "learning_rate": 1e-06, + "loss": 0.1609, + "step": 2652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.23046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4284.0, + "completions/mean_length": 5343.4765625, + "completions/mean_terminated_length": 2036.923828125, + "completions/min_length": 958.0, + "completions/min_terminated_length": 958.0, + "entropy": 0.2657729983329773, + "epoch": 4.251602564102564, + "frac_reward_zero_std": 0.0, + "grad_norm": 1515.29736328125, + "learning_rate": 1e-06, + "loss": 0.0791, + "num_tokens": 1566282654.0, + "reward": 0.28956276178359985, + "reward_std": 0.050334710627794266, + "rewards/progression_diversity/mean": -0.04128318279981613, + "rewards/progression_diversity/std": 0.08098018169403076, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.7276529669761658, + "rewards/symbolic_reward_partial_score/std": 0.1922309845685959, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9493874907493591, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 63.77783203125, + "step": 2653 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2197081595659256, + "epoch": 4.253205128205128, + "grad_norm": 0.02242899127304554, + "learning_rate": 1e-06, + "loss": 0.192, + "step": 2654 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.220329187810421, + "epoch": 4.2548076923076925, + "grad_norm": 0.013821829110383987, + "learning_rate": 1e-06, + "loss": 0.1641, + "step": 2655 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.21346864849328995, + "epoch": 4.256410256410256, + "grad_norm": 0.020072348415851593, + "learning_rate": 1e-06, + "loss": 0.2551, + "step": 2656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.279296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4037.0, + "completions/mean_length": 6021.244140625, + "completions/mean_terminated_length": 2005.3251953125, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "entropy": 0.2017608880996704, + "epoch": 4.25801282051282, + "frac_reward_zero_std": 0.0, + "grad_norm": 1055.4764404296875, + "learning_rate": 1e-06, + "loss": 0.1218, + "num_tokens": 1570270299.0, + "reward": 0.3231794834136963, + "reward_std": 0.05370699614286423, + "rewards/progression_diversity/mean": -0.05509728938341141, + "rewards/progression_diversity/std": 0.09356637299060822, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.7047525644302368, + "rewards/symbolic_reward_partial_score/std": 0.21620500087738037, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.935107946395874, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 70.60637664794922, + "step": 2657 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.23436833918094635, + "epoch": 4.259615384615385, + "grad_norm": 139.91903686523438, + "learning_rate": 1e-06, + "loss": 0.1392, + "step": 2658 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.1886296272277832, + "epoch": 4.261217948717949, + "grad_norm": 23.106496810913086, + "learning_rate": 1e-06, + "loss": 0.2419, + "step": 2659 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2260531634092331, + "epoch": 4.262820512820513, + "grad_norm": 0.19578726589679718, + "learning_rate": 1e-06, + "loss": 0.1691, + "step": 2660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.287109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3542.0, + "completions/mean_length": 6165.703125, + "completions/mean_terminated_length": 2050.38916015625, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "entropy": 0.19962295144796371, + "epoch": 4.264423076923077, + "frac_reward_zero_std": 0.0, + "grad_norm": 7102.345703125, + "learning_rate": 1e-06, + "loss": 0.1947, + "num_tokens": 1574379123.0, + "reward": 0.31796449422836304, + "reward_std": 0.04501751810312271, + "rewards/progression_diversity/mean": -0.06097230315208435, + "rewards/progression_diversity/std": 0.10001429170370102, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.7155599594116211, + "rewards/symbolic_reward_partial_score/std": 0.19974127411842346, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9347620606422424, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 71.3311767578125, + "step": 2661 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.20831771194934845, + "epoch": 4.266025641025641, + "grad_norm": 1.6806011199951172, + "learning_rate": 1e-06, + "loss": 0.2179, + "step": 2662 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.2336948812007904, + "epoch": 4.267628205128205, + "grad_norm": 0.07298646867275238, + "learning_rate": 1e-06, + "loss": 0.1278, + "step": 2663 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2085075080394745, + "epoch": 4.269230769230769, + "grad_norm": 0.7998039722442627, + "learning_rate": 1e-06, + "loss": 0.1634, + "step": 2664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.259765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3341.0, + "completions/mean_length": 5808.7734375, + "completions/mean_terminated_length": 2097.67822265625, + "completions/min_length": 1006.0, + "completions/min_terminated_length": 1006.0, + "entropy": 0.21773767471313477, + "epoch": 4.270833333333333, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1245.658203125, + "learning_rate": 1e-06, + "loss": 0.1845, + "num_tokens": 1578336351.0, + "reward": 0.32293805480003357, + "reward_std": 0.05607787147164345, + "rewards/progression_diversity/mean": -0.048967085778713226, + "rewards/progression_diversity/std": 0.0883127897977829, + "rewards/symbolic_reward_accuracy/mean": 0.173828125, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.7343424558639526, + "rewards/symbolic_reward_partial_score/std": 0.213489830493927, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9438670873641968, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 65.19830322265625, + "step": 2665 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.20395781844854355, + "epoch": 4.272435897435898, + "grad_norm": 1100.1546630859375, + "learning_rate": 1e-06, + "loss": 0.2518, + "step": 2666 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.23760968446731567, + "epoch": 4.274038461538462, + "grad_norm": 0.022954348474740982, + "learning_rate": 1e-06, + "loss": 0.0944, + "step": 2667 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.22392556816339493, + "epoch": 4.2756410256410255, + "grad_norm": 0.028759047389030457, + "learning_rate": 1e-06, + "loss": 0.1897, + "step": 2668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.271484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4153.0, + "completions/mean_length": 5969.759765625, + "completions/mean_terminated_length": 2088.849853515625, + "completions/min_length": 986.0, + "completions/min_terminated_length": 986.0, + "entropy": 0.22678442299365997, + "epoch": 4.277243589743589, + "frac_reward_zero_std": 0.0625, + "grad_norm": 729.8278198242188, + "learning_rate": 1e-06, + "loss": 0.0574, + "num_tokens": 1582333732.0, + "reward": 0.2548202872276306, + "reward_std": 0.04770761728286743, + "rewards/progression_diversity/mean": -0.0428718663752079, + "rewards/progression_diversity/std": 0.07706419378519058, + "rewards/symbolic_reward_accuracy/mean": 0.107421875, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.6405435800552368, + "rewards/symbolic_reward_partial_score/std": 0.21080487966537476, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9503331780433655, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 61.90028381347656, + "step": 2669 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.20155585557222366, + "epoch": 4.278846153846154, + "grad_norm": 0.12458622455596924, + "learning_rate": 1e-06, + "loss": 0.253, + "step": 2670 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.21851865947246552, + "epoch": 4.280448717948718, + "grad_norm": 0.17406442761421204, + "learning_rate": 1e-06, + "loss": 0.1624, + "step": 2671 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2346644550561905, + "epoch": 4.282051282051282, + "grad_norm": 0.018945371732115746, + "learning_rate": 1e-06, + "loss": 0.1564, + "step": 2672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.23828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3899.0, + "completions/mean_length": 5541.857421875, + "completions/mean_terminated_length": 2150.212890625, + "completions/min_length": 1054.0, + "completions/min_terminated_length": 1054.0, + "entropy": 0.2370782345533371, + "epoch": 4.283653846153846, + "frac_reward_zero_std": 0.0, + "grad_norm": 549.9456787109375, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 1585986235.0, + "reward": 0.34320884943008423, + "reward_std": 0.08159251511096954, + "rewards/progression_diversity/mean": -0.03165600448846817, + "rewards/progression_diversity/std": 0.06647796928882599, + "rewards/symbolic_reward_accuracy/mean": 0.220703125, + "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, + "rewards/symbolic_reward_partial_score/mean": 0.7088867425918579, + "rewards/symbolic_reward_partial_score/std": 0.2290610820055008, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9550345540046692, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 60.594451904296875, + "step": 2673 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.21898599714040756, + "epoch": 4.285256410256411, + "grad_norm": 0.42992645502090454, + "learning_rate": 1e-06, + "loss": 0.2189, + "step": 2674 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.24925994873046875, + "epoch": 4.2868589743589745, + "grad_norm": 0.020100675523281097, + "learning_rate": 1e-06, + "loss": 0.0946, + "step": 2675 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.2393641099333763, + "epoch": 4.288461538461538, + "grad_norm": 0.01565798930823803, + "learning_rate": 1e-06, + "loss": 0.2389, + "step": 2676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4642.0, + "completions/mean_length": 5307.583984375, + "completions/mean_terminated_length": 2276.723876953125, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "entropy": 0.2343580722808838, + "epoch": 4.290064102564102, + "frac_reward_zero_std": 0.0, + "grad_norm": 1280.0311279296875, + "learning_rate": 1e-06, + "loss": 0.0584, + "num_tokens": 1589516614.0, + "reward": 0.3476376533508301, + "reward_std": 0.125152587890625, + "rewards/progression_diversity/mean": -0.03457668796181679, + "rewards/progression_diversity/std": 0.08347929269075394, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.7035644054412842, + "rewards/symbolic_reward_partial_score/std": 0.2703304886817932, + "rewards/tag_count_reward/mean": -0.048828125, + "rewards/tag_count_reward/std": 0.2157193273305893, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9533030986785889, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 62.77830505371094, + "step": 2677 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.26719367504119873, + "epoch": 4.291666666666667, + "grad_norm": 0.26546433568000793, + "learning_rate": 1e-06, + "loss": 0.1209, + "step": 2678 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2331203892827034, + "epoch": 4.293269230769231, + "grad_norm": 0.01867266371846199, + "learning_rate": 1e-06, + "loss": 0.2064, + "step": 2679 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.23465164005756378, + "epoch": 4.294871794871795, + "grad_norm": 0.015481102280318737, + "learning_rate": 1e-06, + "loss": 0.1809, + "step": 2680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.2734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4966.0, + "completions/mean_length": 6191.328125, + "completions/mean_terminated_length": 2355.376220703125, + "completions/min_length": 918.0, + "completions/min_terminated_length": 918.0, + "entropy": 0.18589715659618378, + "epoch": 4.296474358974359, + "frac_reward_zero_std": 0.0, + "grad_norm": 1452.932373046875, + "learning_rate": 1e-06, + "loss": 0.1798, + "num_tokens": 1593491662.0, + "reward": 0.35412442684173584, + "reward_std": 0.1382516324520111, + "rewards/progression_diversity/mean": -0.046052753925323486, + "rewards/progression_diversity/std": 0.09779871255159378, + "rewards/symbolic_reward_accuracy/mean": 0.2578125, + "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, + "rewards/symbolic_reward_partial_score/mean": 0.6865071654319763, + "rewards/symbolic_reward_partial_score/std": 0.29961350560188293, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.935286283493042, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 69.72281646728516, + "step": 2681 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.21021146327257156, + "epoch": 4.298076923076923, + "grad_norm": 0.04195324704051018, + "learning_rate": 1e-06, + "loss": 0.1036, + "step": 2682 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.20793236792087555, + "epoch": 4.299679487179487, + "grad_norm": 0.15370796620845795, + "learning_rate": 1e-06, + "loss": 0.1581, + "step": 2683 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.19368456304073334, + "epoch": 4.301282051282051, + "grad_norm": 0.04016382619738579, + "learning_rate": 1e-06, + "loss": 0.2206, + "step": 2684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4625.0, + "completions/mean_length": 5578.798828125, + "completions/mean_terminated_length": 2271.084228515625, + "completions/min_length": 590.0, + "completions/min_terminated_length": 590.0, + "entropy": 0.21986397355794907, + "epoch": 4.302884615384615, + "frac_reward_zero_std": 0.0, + "grad_norm": 847.1763305664062, + "learning_rate": 1e-06, + "loss": 0.1179, + "num_tokens": 1597246327.0, + "reward": 0.2654598355293274, + "reward_std": 0.09309060871601105, + "rewards/progression_diversity/mean": -0.039955541491508484, + "rewards/progression_diversity/std": 0.08839154243469238, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.6368489265441895, + "rewards/symbolic_reward_partial_score/std": 0.2666816711425781, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.944598376750946, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 68.18759155273438, + "step": 2685 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.20748194307088852, + "epoch": 4.30448717948718, + "grad_norm": 1317.310302734375, + "learning_rate": 1e-06, + "loss": 0.4452, + "step": 2686 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23975148051977158, + "epoch": 4.306089743589744, + "grad_norm": 0.02449853904545307, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 2687 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2513374388217926, + "epoch": 4.3076923076923075, + "grad_norm": 0.0537121407687664, + "learning_rate": 1e-06, + "loss": 0.1342, + "step": 2688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.173828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4082.0, + "completions/mean_length": 4746.345703125, + "completions/mean_terminated_length": 2297.76123046875, + "completions/min_length": 1121.0, + "completions/min_terminated_length": 1121.0, + "entropy": 0.264044925570488, + "epoch": 4.309294871794872, + "frac_reward_zero_std": 0.03125, + "grad_norm": 532.7546997070312, + "learning_rate": 1e-06, + "loss": 0.1721, + "num_tokens": 1600521912.0, + "reward": 0.3210293650627136, + "reward_std": 0.10824176669120789, + "rewards/progression_diversity/mean": -0.02890094369649887, + "rewards/progression_diversity/std": 0.08233704417943954, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.7064778804779053, + "rewards/symbolic_reward_partial_score/std": 0.26733410358428955, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.965470552444458, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 58.45082473754883, + "step": 2689 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.390625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.27652691304683685, + "epoch": 4.310897435897436, + "grad_norm": 2549.2978515625, + "learning_rate": 1e-06, + "loss": 0.0809, + "step": 2690 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.23626501858234406, + "epoch": 4.3125, + "grad_norm": 25.48175048828125, + "learning_rate": 1e-06, + "loss": 0.1749, + "step": 2691 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.26965607702732086, + "epoch": 4.314102564102564, + "grad_norm": 21.068880081176758, + "learning_rate": 1e-06, + "loss": 0.158, + "step": 2692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.173828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4066.0, + "completions/mean_length": 4708.7734375, + "completions/mean_terminated_length": 2252.28369140625, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "entropy": 0.2753334045410156, + "epoch": 4.315705128205128, + "frac_reward_zero_std": 0.09375, + "grad_norm": 2231.534423828125, + "learning_rate": 1e-06, + "loss": 0.1243, + "num_tokens": 1603895476.0, + "reward": 0.3245422840118408, + "reward_std": 0.10428034514188766, + "rewards/progression_diversity/mean": -0.028682753443717957, + "rewards/progression_diversity/std": 0.07225596159696579, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.662841796875, + "rewards/symbolic_reward_partial_score/std": 0.27776244282722473, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9683549404144287, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 57.83184051513672, + "step": 2693 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2380198836326599, + "epoch": 4.3173076923076925, + "grad_norm": 908.7568969726562, + "learning_rate": 1e-06, + "loss": 0.1637, + "step": 2694 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2474098801612854, + "epoch": 4.318910256410256, + "grad_norm": 24.16575050354004, + "learning_rate": 1e-06, + "loss": 0.213, + "step": 2695 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.28730684518814087, + "epoch": 4.32051282051282, + "grad_norm": 0.17024962604045868, + "learning_rate": 1e-06, + "loss": 0.027, + "step": 2696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.17578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5872.0, + "completions/mean_length": 4746.83203125, + "completions/mean_terminated_length": 2264.9716796875, + "completions/min_length": 1135.0, + "completions/min_terminated_length": 1135.0, + "entropy": 0.25719259679317474, + "epoch": 4.322115384615385, + "frac_reward_zero_std": 0.0, + "grad_norm": 801.6245727539062, + "learning_rate": 1e-06, + "loss": 0.1019, + "num_tokens": 1607156350.0, + "reward": 0.30364054441452026, + "reward_std": 0.11523503810167313, + "rewards/progression_diversity/mean": -0.023154182359576225, + "rewards/progression_diversity/std": 0.07085532695055008, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.668505847454071, + "rewards/symbolic_reward_partial_score/std": 0.2690269947052002, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9642022848129272, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 60.5496826171875, + "step": 2697 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.264162078499794, + "epoch": 4.323717948717949, + "grad_norm": 7159.71484375, + "learning_rate": 1e-06, + "loss": 0.5042, + "step": 2698 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.2615188807249069, + "epoch": 4.325320512820513, + "grad_norm": 6776.10302734375, + "learning_rate": 1e-06, + "loss": 0.2205, + "step": 2699 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2697322964668274, + "epoch": 4.326923076923077, + "grad_norm": 0.10481588542461395, + "learning_rate": 1e-06, + "loss": 0.1195, + "step": 2700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.185546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3512.0, + "completions/mean_length": 4892.396484375, + "completions/mean_terminated_length": 2274.4052734375, + "completions/min_length": 1158.0, + "completions/min_terminated_length": 1158.0, + "entropy": 0.27855899930000305, + "epoch": 4.328525641025641, + "frac_reward_zero_std": 0.0, + "grad_norm": 877.5675659179688, + "learning_rate": 1e-06, + "loss": 0.0915, + "num_tokens": 1610557369.0, + "reward": 0.24516724050045013, + "reward_std": 0.09491077810525894, + "rewards/progression_diversity/mean": -0.03063952922821045, + "rewards/progression_diversity/std": 0.08986818790435791, + "rewards/symbolic_reward_accuracy/mean": 0.109375, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.6248860955238342, + "rewards/symbolic_reward_partial_score/std": 0.26446953415870667, + "rewards/tag_count_reward/mean": -0.076171875, + "rewards/tag_count_reward/std": 0.26553234457969666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9703221321105957, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 56.01524353027344, + "step": 2701 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2597070336341858, + "epoch": 4.330128205128205, + "grad_norm": 2198.372802734375, + "learning_rate": 1e-06, + "loss": 0.3091, + "step": 2702 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.26145724952220917, + "epoch": 4.331730769230769, + "grad_norm": 1281.1771240234375, + "learning_rate": 1e-06, + "loss": 0.1863, + "step": 2703 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2582560032606125, + "epoch": 4.333333333333333, + "grad_norm": 0.06832489371299744, + "learning_rate": 1e-06, + "loss": 0.2302, + "step": 2704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4234.0, + "completions/mean_length": 4208.84765625, + "completions/mean_terminated_length": 2280.65625, + "completions/min_length": 1218.0, + "completions/min_terminated_length": 1218.0, + "entropy": 0.2881985604763031, + "epoch": 4.334935897435898, + "frac_reward_zero_std": 0.03125, + "grad_norm": 2649.09423828125, + "learning_rate": 1e-06, + "loss": 0.2109, + "num_tokens": 1613462507.0, + "reward": 0.3500940799713135, + "reward_std": 0.11316157132387161, + "rewards/progression_diversity/mean": -0.019399691373109818, + "rewards/progression_diversity/std": 0.07821278274059296, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.6936686038970947, + "rewards/symbolic_reward_partial_score/std": 0.2801879942417145, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9930620193481445, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 43.674652099609375, + "step": 2705 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.28035198152065277, + "epoch": 4.336538461538462, + "grad_norm": 6.077508449554443, + "learning_rate": 1e-06, + "loss": 0.1302, + "step": 2706 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.29166266322135925, + "epoch": 4.3381410256410255, + "grad_norm": 0.6038011908531189, + "learning_rate": 1e-06, + "loss": 0.1481, + "step": 2707 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.28930486738681793, + "epoch": 4.339743589743589, + "grad_norm": 0.5783957839012146, + "learning_rate": 1e-06, + "loss": 0.1213, + "step": 2708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4111.0, + "completions/mean_length": 4497.861328125, + "completions/mean_terminated_length": 2296.724609375, + "completions/min_length": 1329.0, + "completions/min_terminated_length": 1329.0, + "entropy": 0.2855447679758072, + "epoch": 4.341346153846154, + "frac_reward_zero_std": 0.0, + "grad_norm": 770.3568725585938, + "learning_rate": 1e-06, + "loss": 0.1363, + "num_tokens": 1616717908.0, + "reward": 0.2790448069572449, + "reward_std": 0.11913672089576721, + "rewards/progression_diversity/mean": -0.018859868869185448, + "rewards/progression_diversity/std": 0.06757510453462601, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.6872884035110474, + "rewards/symbolic_reward_partial_score/std": 0.2816750705242157, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9902298450469971, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 45.65616989135742, + "step": 2709 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.26931584626436234, + "epoch": 4.342948717948718, + "grad_norm": 456.98199462890625, + "learning_rate": 1e-06, + "loss": 0.2891, + "step": 2710 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2959611713886261, + "epoch": 4.344551282051282, + "grad_norm": 6.0887250900268555, + "learning_rate": 1e-06, + "loss": 0.7324, + "step": 2711 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2712084949016571, + "epoch": 4.346153846153846, + "grad_norm": 61.22941970825195, + "learning_rate": 1e-06, + "loss": 0.2727, + "step": 2712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4333.0, + "completions/mean_length": 4048.97265625, + "completions/mean_terminated_length": 2286.825927734375, + "completions/min_length": 857.0, + "completions/min_terminated_length": 857.0, + "entropy": 0.2950481027364731, + "epoch": 4.347756410256411, + "frac_reward_zero_std": 0.03125, + "grad_norm": 1410.4874267578125, + "learning_rate": 1e-06, + "loss": 0.1737, + "num_tokens": 1619610758.0, + "reward": 0.28769299387931824, + "reward_std": 0.09948226064443588, + "rewards/progression_diversity/mean": -0.012928030453622341, + "rewards/progression_diversity/std": 0.05409675091505051, + "rewards/symbolic_reward_accuracy/mean": 0.1640625, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.6508138179779053, + "rewards/symbolic_reward_partial_score/std": 0.267758846282959, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0013058185577393, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 40.96929931640625, + "step": 2713 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3074454814195633, + "epoch": 4.3493589743589745, + "grad_norm": 1392.9722900390625, + "learning_rate": 1e-06, + "loss": 0.1584, + "step": 2714 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.28932228684425354, + "epoch": 4.350961538461538, + "grad_norm": 12.400731086730957, + "learning_rate": 1e-06, + "loss": 0.5763, + "step": 2715 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.28287215530872345, + "epoch": 4.352564102564102, + "grad_norm": 488.6679382324219, + "learning_rate": 1e-06, + "loss": 0.2059, + "step": 2716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.142578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4276.0, + "completions/mean_length": 4305.140625, + "completions/mean_terminated_length": 2296.583251953125, + "completions/min_length": 714.0, + "completions/min_terminated_length": 714.0, + "entropy": 0.2647574245929718, + "epoch": 4.354166666666667, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1532.6412353515625, + "learning_rate": 1e-06, + "loss": 0.247, + "num_tokens": 1622678990.0, + "reward": 0.26650041341781616, + "reward_std": 0.10990884155035019, + "rewards/progression_diversity/mean": -0.019881153479218483, + "rewards/progression_diversity/std": 0.07622101157903671, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.6494140625, + "rewards/symbolic_reward_partial_score/std": 0.2761240601539612, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9932198524475098, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 43.36308288574219, + "step": 2717 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.30068182945251465, + "epoch": 4.355769230769231, + "grad_norm": 9.29902172088623, + "learning_rate": 1e-06, + "loss": 0.137, + "step": 2718 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2581791281700134, + "epoch": 4.357371794871795, + "grad_norm": 1740.7247314453125, + "learning_rate": 1e-06, + "loss": 0.235, + "step": 2719 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.29872237145900726, + "epoch": 4.358974358974359, + "grad_norm": 24.941686630249023, + "learning_rate": 1e-06, + "loss": 0.1233, + "step": 2720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.138671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5111.0, + "completions/mean_length": 4246.28515625, + "completions/mean_terminated_length": 2292.140625, + "completions/min_length": 1088.0, + "completions/min_terminated_length": 1088.0, + "entropy": 0.3010956943035126, + "epoch": 4.360576923076923, + "frac_reward_zero_std": 0.09375, + "grad_norm": 2546.631103515625, + "learning_rate": 1e-06, + "loss": 0.0579, + "num_tokens": 1625667760.0, + "reward": 0.2679121792316437, + "reward_std": 0.097173772752285, + "rewards/progression_diversity/mean": -0.019818993285298347, + "rewards/progression_diversity/std": 0.07718969136476517, + "rewards/symbolic_reward_accuracy/mean": 0.140625, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.6358886957168579, + "rewards/symbolic_reward_partial_score/std": 0.26983723044395447, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0032100677490234, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 38.43324279785156, + "step": 2721 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2788269519805908, + "epoch": 4.362179487179487, + "grad_norm": 0.07314433157444, + "learning_rate": 1e-06, + "loss": 0.1801, + "step": 2722 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2793916165828705, + "epoch": 4.363782051282051, + "grad_norm": 0.012875649146735668, + "learning_rate": 1e-06, + "loss": 0.179, + "step": 2723 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2723848670721054, + "epoch": 4.365384615384615, + "grad_norm": 0.053210750222206116, + "learning_rate": 1e-06, + "loss": 0.2149, + "step": 2724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.177734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5363.0, + "completions/mean_length": 4823.73046875, + "completions/mean_terminated_length": 2324.955078125, + "completions/min_length": 895.0, + "completions/min_terminated_length": 895.0, + "entropy": 0.26607632637023926, + "epoch": 4.36698717948718, + "frac_reward_zero_std": 0.03125, + "grad_norm": 536.0072631835938, + "learning_rate": 1e-06, + "loss": 0.1797, + "num_tokens": 1629215734.0, + "reward": 0.19067470729351044, + "reward_std": 0.1059282124042511, + "rewards/progression_diversity/mean": -0.025302495807409286, + "rewards/progression_diversity/std": 0.08640986680984497, + "rewards/symbolic_reward_accuracy/mean": 0.060546875, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.5459309816360474, + "rewards/symbolic_reward_partial_score/std": 0.26790010929107666, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9844135046005249, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 45.332759857177734, + "step": 2725 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2631705701351166, + "epoch": 4.368589743589744, + "grad_norm": 1414.080810546875, + "learning_rate": 1e-06, + "loss": 0.3588, + "step": 2726 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.23933739215135574, + "epoch": 4.3701923076923075, + "grad_norm": 841.8580932617188, + "learning_rate": 1e-06, + "loss": 0.3975, + "step": 2727 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2838355749845505, + "epoch": 4.371794871794872, + "grad_norm": 46.018558502197266, + "learning_rate": 1e-06, + "loss": 0.133, + "step": 2728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5066.0, + "completions/mean_length": 4252.61328125, + "completions/mean_terminated_length": 2331.35302734375, + "completions/min_length": 1241.0, + "completions/min_terminated_length": 1241.0, + "entropy": 0.2777034640312195, + "epoch": 4.373397435897436, + "frac_reward_zero_std": 0.03125, + "grad_norm": 674.3890991210938, + "learning_rate": 1e-06, + "loss": 0.1737, + "num_tokens": 1632154656.0, + "reward": 0.3058478832244873, + "reward_std": 0.1194763332605362, + "rewards/progression_diversity/mean": -0.017753848806023598, + "rewards/progression_diversity/std": 0.07137567549943924, + "rewards/symbolic_reward_accuracy/mean": 0.197265625, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.6541991829872131, + "rewards/symbolic_reward_partial_score/std": 0.2857528626918793, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.001778483390808, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 38.360496520996094, + "step": 2729 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3052513748407364, + "epoch": 4.375, + "grad_norm": 153.75807189941406, + "learning_rate": 1e-06, + "loss": 0.0824, + "step": 2730 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2919474095106125, + "epoch": 4.376602564102564, + "grad_norm": 0.023059062659740448, + "learning_rate": 1e-06, + "loss": 0.2305, + "step": 2731 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.29401521384716034, + "epoch": 4.378205128205128, + "grad_norm": 0.014374534599483013, + "learning_rate": 1e-06, + "loss": 0.2279, + "step": 2732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4636.0, + "completions/mean_length": 4243.36328125, + "completions/mean_terminated_length": 2320.63818359375, + "completions/min_length": 1151.0, + "completions/min_terminated_length": 1151.0, + "entropy": 0.30612772703170776, + "epoch": 4.3798076923076925, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1426.8148193359375, + "learning_rate": 1e-06, + "loss": 0.1158, + "num_tokens": 1635162922.0, + "reward": 0.23149944841861725, + "reward_std": 0.09988817572593689, + "rewards/progression_diversity/mean": -0.018513914197683334, + "rewards/progression_diversity/std": 0.07170087099075317, + "rewards/symbolic_reward_accuracy/mean": 0.1015625, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.5978027582168579, + "rewards/symbolic_reward_partial_score/std": 0.2723853290081024, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0040245056152344, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 38.19234848022461, + "step": 2733 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.277388796210289, + "epoch": 4.381410256410256, + "grad_norm": 0.028441831469535828, + "learning_rate": 1e-06, + "loss": 0.2186, + "step": 2734 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.29677003622055054, + "epoch": 4.38301282051282, + "grad_norm": 0.014668729156255722, + "learning_rate": 1e-06, + "loss": 0.117, + "step": 2735 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.28144995868206024, + "epoch": 4.384615384615385, + "grad_norm": 0.014569677412509918, + "learning_rate": 1e-06, + "loss": 0.219, + "step": 2736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3800.0, + "completions/mean_length": 4181.333984375, + "completions/mean_terminated_length": 2344.07861328125, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "entropy": 0.277862548828125, + "epoch": 4.386217948717949, + "frac_reward_zero_std": 0.03125, + "grad_norm": 822.4926147460938, + "learning_rate": 1e-06, + "loss": 0.2072, + "num_tokens": 1638169845.0, + "reward": 0.3534255623817444, + "reward_std": 0.12979258596897125, + "rewards/progression_diversity/mean": -0.015356136485934258, + "rewards/progression_diversity/std": 0.0585295706987381, + "rewards/symbolic_reward_accuracy/mean": 0.255859375, + "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, + "rewards/symbolic_reward_partial_score/mean": 0.6857584714889526, + "rewards/symbolic_reward_partial_score/std": 0.2829958498477936, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9974150657653809, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 42.795005798339844, + "step": 2737 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3079565465450287, + "epoch": 4.387820512820513, + "grad_norm": 3624.53125, + "learning_rate": 1e-06, + "loss": 0.4683, + "step": 2738 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2956803888082504, + "epoch": 4.389423076923077, + "grad_norm": 605.93310546875, + "learning_rate": 1e-06, + "loss": 0.216, + "step": 2739 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2977532297372818, + "epoch": 4.391025641025641, + "grad_norm": 0.01604938693344593, + "learning_rate": 1e-06, + "loss": 0.1553, + "step": 2740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.115234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3997.0, + "completions/mean_length": 3936.32421875, + "completions/mean_terminated_length": 2315.103759765625, + "completions/min_length": 616.0, + "completions/min_terminated_length": 616.0, + "entropy": 0.288834884762764, + "epoch": 4.392628205128205, + "frac_reward_zero_std": 0.09375, + "grad_norm": 13779.6708984375, + "learning_rate": 1e-06, + "loss": 0.2399, + "num_tokens": 1641167563.0, + "reward": 0.19894778728485107, + "reward_std": 0.08240419626235962, + "rewards/progression_diversity/mean": -0.016354724764823914, + "rewards/progression_diversity/std": 0.06330784410238266, + "rewards/symbolic_reward_accuracy/mean": 0.056640625, + "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, + "rewards/symbolic_reward_partial_score/mean": 0.5764648914337158, + "rewards/symbolic_reward_partial_score/std": 0.26688218116760254, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.010166883468628, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 37.03101348876953, + "step": 2741 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3093618303537369, + "epoch": 4.394230769230769, + "grad_norm": 40587.7109375, + "learning_rate": 1e-06, + "loss": 2.4703, + "step": 2742 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.31586118042469025, + "epoch": 4.395833333333333, + "grad_norm": 1233050.0, + "learning_rate": 1e-06, + "loss": 117.5929, + "step": 2743 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.29217708110809326, + "epoch": 4.397435897435898, + "grad_norm": 0.014232386834919453, + "learning_rate": 1e-06, + "loss": 0.1765, + "step": 2744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4179.0, + "completions/mean_length": 3859.9765625, + "completions/mean_terminated_length": 2352.708984375, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "entropy": 0.2802654653787613, + "epoch": 4.399038461538462, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1434.6417236328125, + "learning_rate": 1e-06, + "loss": 0.2646, + "num_tokens": 1644092767.0, + "reward": 0.2843579947948456, + "reward_std": 0.11489598453044891, + "rewards/progression_diversity/mean": -0.01488535013049841, + "rewards/progression_diversity/std": 0.06615662574768066, + "rewards/symbolic_reward_accuracy/mean": 0.166015625, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.6384602785110474, + "rewards/symbolic_reward_partial_score/std": 0.2716268301010132, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0159142017364502, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 34.84583282470703, + "step": 2745 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3136250674724579, + "epoch": 4.4006410256410255, + "grad_norm": 4538.53564453125, + "learning_rate": 1e-06, + "loss": 0.6472, + "step": 2746 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3199605792760849, + "epoch": 4.402243589743589, + "grad_norm": 3800.058349609375, + "learning_rate": 1e-06, + "loss": 0.4546, + "step": 2747 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.31357628107070923, + "epoch": 4.403846153846154, + "grad_norm": 11993.4921875, + "learning_rate": 1e-06, + "loss": 1.4673, + "step": 2748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.126953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4939.0, + "completions/mean_length": 4187.60546875, + "completions/mean_terminated_length": 2414.08056640625, + "completions/min_length": 1315.0, + "completions/min_terminated_length": 1315.0, + "entropy": 0.30130642652511597, + "epoch": 4.405448717948718, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1137.6595458984375, + "learning_rate": 1e-06, + "loss": 0.075, + "num_tokens": 1647127509.0, + "reward": 0.23044368624687195, + "reward_std": 0.11951728165149689, + "rewards/progression_diversity/mean": -0.023503467440605164, + "rewards/progression_diversity/std": 0.08980035781860352, + "rewards/symbolic_reward_accuracy/mean": 0.09765625, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.597705066204071, + "rewards/symbolic_reward_partial_score/std": 0.25887858867645264, + "rewards/tag_count_reward/mean": -0.072265625, + "rewards/tag_count_reward/std": 0.2591804563999176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0048857927322388, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 40.13702392578125, + "step": 2749 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.29961445927619934, + "epoch": 4.407051282051282, + "grad_norm": 0.05410754680633545, + "learning_rate": 1e-06, + "loss": 0.1237, + "step": 2750 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3096155524253845, + "epoch": 4.408653846153846, + "grad_norm": 0.017144974321126938, + "learning_rate": 1e-06, + "loss": 0.1487, + "step": 2751 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2930636405944824, + "epoch": 4.410256410256411, + "grad_norm": 0.6245795488357544, + "learning_rate": 1e-06, + "loss": 0.184, + "step": 2752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4303.0, + "completions/mean_length": 3234.2734375, + "completions/mean_terminated_length": 2328.342529296875, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "entropy": 0.3245136886835098, + "epoch": 4.4118589743589745, + "frac_reward_zero_std": 0.0625, + "grad_norm": 1545.5665283203125, + "learning_rate": 1e-06, + "loss": 0.1395, + "num_tokens": 1649663425.0, + "reward": 0.3059009313583374, + "reward_std": 0.08847295492887497, + "rewards/progression_diversity/mean": -0.014401247724890709, + "rewards/progression_diversity/std": 0.06953856348991394, + "rewards/symbolic_reward_accuracy/mean": 0.171875, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.6913737058639526, + "rewards/symbolic_reward_partial_score/std": 0.24967017769813538, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0390673875808716, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 25.484981536865234, + "step": 2753 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.31169527769088745, + "epoch": 4.413461538461538, + "grad_norm": 0.02673480287194252, + "learning_rate": 1e-06, + "loss": 0.1752, + "step": 2754 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3431106209754944, + "epoch": 4.415064102564102, + "grad_norm": 0.0156412310898304, + "learning_rate": 1e-06, + "loss": 0.0603, + "step": 2755 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.3379392623901367, + "epoch": 4.416666666666667, + "grad_norm": 0.03630439192056656, + "learning_rate": 1e-06, + "loss": 0.0386, + "step": 2756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4258.0, + "completions/mean_length": 2953.6015625, + "completions/mean_terminated_length": 2205.929931640625, + "completions/min_length": 1181.0, + "completions/min_terminated_length": 1181.0, + "entropy": 0.33529841899871826, + "epoch": 4.418269230769231, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1840.3985595703125, + "learning_rate": 1e-06, + "loss": 0.0718, + "num_tokens": 1652110821.0, + "reward": 0.27117669582366943, + "reward_std": 0.089139424264431, + "rewards/progression_diversity/mean": -0.014164266176521778, + "rewards/progression_diversity/std": 0.07661490887403488, + "rewards/symbolic_reward_accuracy/mean": 0.12109375, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.6706705689430237, + "rewards/symbolic_reward_partial_score/std": 0.23558108508586884, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.038825273513794, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 25.822206497192383, + "step": 2757 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3324403911828995, + "epoch": 4.419871794871795, + "grad_norm": 0.03022000938653946, + "learning_rate": 1e-06, + "loss": 0.0187, + "step": 2758 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.32936082780361176, + "epoch": 4.421474358974359, + "grad_norm": 0.026491805911064148, + "learning_rate": 1e-06, + "loss": 0.1138, + "step": 2759 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3215688169002533, + "epoch": 4.423076923076923, + "grad_norm": 0.019565416499972343, + "learning_rate": 1e-06, + "loss": 0.1158, + "step": 2760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4371.0, + "completions/mean_length": 2875.587890625, + "completions/mean_terminated_length": 2211.23974609375, + "completions/min_length": 1132.0, + "completions/min_terminated_length": 1132.0, + "entropy": 0.34050194919109344, + "epoch": 4.424679487179487, + "frac_reward_zero_std": 0.15625, + "grad_norm": 462.5588684082031, + "learning_rate": 1e-06, + "loss": 0.0736, + "num_tokens": 1654529730.0, + "reward": 0.3495715260505676, + "reward_std": 0.09474267810583115, + "rewards/progression_diversity/mean": -0.010134847834706306, + "rewards/progression_diversity/std": 0.061041031032800674, + "rewards/symbolic_reward_accuracy/mean": 0.22265625, + "rewards/symbolic_reward_accuracy/std": 0.41643625497817993, + "rewards/symbolic_reward_partial_score/mean": 0.7293782830238342, + "rewards/symbolic_reward_partial_score/std": 0.21722783148288727, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0413200855255127, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 24.018436431884766, + "step": 2761 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3244868367910385, + "epoch": 4.426282051282051, + "grad_norm": 0.03473278507590294, + "learning_rate": 1e-06, + "loss": 0.0975, + "step": 2762 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3279813975095749, + "epoch": 4.427884615384615, + "grad_norm": 0.02159891277551651, + "learning_rate": 1e-06, + "loss": 0.0595, + "step": 2763 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.33927109837532043, + "epoch": 4.42948717948718, + "grad_norm": 0.017984116449952126, + "learning_rate": 1e-06, + "loss": 0.0433, + "step": 2764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3670.0, + "completions/mean_length": 2952.734375, + "completions/mean_terminated_length": 2116.763671875, + "completions/min_length": 1072.0, + "completions/min_terminated_length": 1072.0, + "entropy": 0.3449130952358246, + "epoch": 4.431089743589744, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.01726151816546917, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 1656940938.0, + "reward": 0.3437863886356354, + "reward_std": 0.07033580541610718, + "rewards/progression_diversity/mean": -0.011987379752099514, + "rewards/progression_diversity/std": 0.059770114719867706, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.7179687023162842, + "rewards/symbolic_reward_partial_score/std": 0.22868217527866364, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.029953956604004, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 30.53244400024414, + "step": 2765 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.33045631647109985, + "epoch": 4.4326923076923075, + "grad_norm": 1069.1544189453125, + "learning_rate": 1e-06, + "loss": 0.1185, + "step": 2766 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3347640484571457, + "epoch": 4.434294871794872, + "grad_norm": 0.23746109008789062, + "learning_rate": 1e-06, + "loss": 0.0323, + "step": 2767 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3243960440158844, + "epoch": 4.435897435897436, + "grad_norm": 0.02076563611626625, + "learning_rate": 1e-06, + "loss": 0.1661, + "step": 2768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3358.0, + "completions/mean_length": 3165.107421875, + "completions/mean_terminated_length": 2135.425048828125, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "entropy": 0.3194960504770279, + "epoch": 4.4375, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1015.4884643554688, + "learning_rate": 1e-06, + "loss": 0.1141, + "num_tokens": 1659435105.0, + "reward": 0.3932766914367676, + "reward_std": 0.10632849484682083, + "rewards/progression_diversity/mean": -0.015594206750392914, + "rewards/progression_diversity/std": 0.06812360882759094, + "rewards/symbolic_reward_accuracy/mean": 0.283203125, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.7548013925552368, + "rewards/symbolic_reward_partial_score/std": 0.23602750897407532, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0133789777755737, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 38.01353073120117, + "step": 2769 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3396587520837784, + "epoch": 4.439102564102564, + "grad_norm": 615.4129638671875, + "learning_rate": 1e-06, + "loss": 0.0409, + "step": 2770 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3090101033449173, + "epoch": 4.440705128205128, + "grad_norm": 1137.1129150390625, + "learning_rate": 1e-06, + "loss": 0.1933, + "step": 2771 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3358742445707321, + "epoch": 4.4423076923076925, + "grad_norm": 4.300602912902832, + "learning_rate": 1e-06, + "loss": 0.1023, + "step": 2772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3278.0, + "completions/mean_length": 3125.611328125, + "completions/mean_terminated_length": 2122.876220703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "entropy": 0.3427940756082535, + "epoch": 4.443910256410256, + "frac_reward_zero_std": 0.125, + "grad_norm": 407.4725341796875, + "learning_rate": 1e-06, + "loss": 0.0307, + "num_tokens": 1661810682.0, + "reward": 0.35953429341316223, + "reward_std": 0.1048397421836853, + "rewards/progression_diversity/mean": -0.011415429413318634, + "rewards/progression_diversity/std": 0.053802087903022766, + "rewards/symbolic_reward_accuracy/mean": 0.25, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.7059895396232605, + "rewards/symbolic_reward_partial_score/std": 0.25418156385421753, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.019977331161499, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 34.17755889892578, + "step": 2773 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3142138421535492, + "epoch": 4.44551282051282, + "grad_norm": 6694.47998046875, + "learning_rate": 1e-06, + "loss": 0.4636, + "step": 2774 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3414735943078995, + "epoch": 4.447115384615385, + "grad_norm": 879.877685546875, + "learning_rate": 1e-06, + "loss": 0.0436, + "step": 2775 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.330064132809639, + "epoch": 4.448717948717949, + "grad_norm": 0.05605228990316391, + "learning_rate": 1e-06, + "loss": 0.0995, + "step": 2776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4448.0, + "completions/mean_length": 3544.04296875, + "completions/mean_terminated_length": 2123.570556640625, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "entropy": 0.3042691648006439, + "epoch": 4.450320512820513, + "frac_reward_zero_std": 0.03125, + "grad_norm": 936.56103515625, + "learning_rate": 1e-06, + "loss": 0.1016, + "num_tokens": 1664474448.0, + "reward": 0.36131882667541504, + "reward_std": 0.11240831017494202, + "rewards/progression_diversity/mean": -0.013626575469970703, + "rewards/progression_diversity/std": 0.05417032539844513, + "rewards/symbolic_reward_accuracy/mean": 0.248046875, + "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, + "rewards/symbolic_reward_partial_score/mean": 0.7146159410476685, + "rewards/symbolic_reward_partial_score/std": 0.23360253870487213, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9984034299850464, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 43.34300994873047, + "step": 2777 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.31309716403484344, + "epoch": 4.451923076923077, + "grad_norm": 12874.2177734375, + "learning_rate": 1e-06, + "loss": 0.5435, + "step": 2778 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3106722980737686, + "epoch": 4.453525641025641, + "grad_norm": 0.4278506338596344, + "learning_rate": 1e-06, + "loss": 0.0563, + "step": 2779 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.307477205991745, + "epoch": 4.455128205128205, + "grad_norm": 0.03869732841849327, + "learning_rate": 1e-06, + "loss": 0.0845, + "step": 2780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3998.0, + "completions/mean_length": 2993.6328125, + "completions/mean_terminated_length": 2100.94189453125, + "completions/min_length": 1022.0, + "completions/min_terminated_length": 1022.0, + "entropy": 0.33481790125370026, + "epoch": 4.456730769230769, + "frac_reward_zero_std": 0.0625, + "grad_norm": 260.7884826660156, + "learning_rate": 1e-06, + "loss": 0.0412, + "num_tokens": 1666869284.0, + "reward": 0.3834180235862732, + "reward_std": 0.1236167624592781, + "rewards/progression_diversity/mean": -0.010737746953964233, + "rewards/progression_diversity/std": 0.04949859157204628, + "rewards/symbolic_reward_accuracy/mean": 0.267578125, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.7504231929779053, + "rewards/symbolic_reward_partial_score/std": 0.22556151449680328, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0300805568695068, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 27.265310287475586, + "step": 2781 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.31526750326156616, + "epoch": 4.458333333333333, + "grad_norm": 7211.5888671875, + "learning_rate": 1e-06, + "loss": 0.8517, + "step": 2782 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.329397588968277, + "epoch": 4.459935897435898, + "grad_norm": 52.46617126464844, + "learning_rate": 1e-06, + "loss": 0.0521, + "step": 2783 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.329983115196228, + "epoch": 4.461538461538462, + "grad_norm": 2546.868896484375, + "learning_rate": 1e-06, + "loss": 0.2009, + "step": 2784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3777.0, + "completions/mean_length": 2726.880859375, + "completions/mean_terminated_length": 2142.767822265625, + "completions/min_length": 1010.0, + "completions/min_terminated_length": 1010.0, + "entropy": 0.32919782400131226, + "epoch": 4.4631410256410255, + "frac_reward_zero_std": 0.15625, + "grad_norm": 794.0968017578125, + "learning_rate": 1e-06, + "loss": 0.0914, + "num_tokens": 1669120647.0, + "reward": 0.3136161267757416, + "reward_std": 0.07102187722921371, + "rewards/progression_diversity/mean": -0.006063465960323811, + "rewards/progression_diversity/std": 0.034248657524585724, + "rewards/symbolic_reward_accuracy/mean": 0.17578125, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.7024902105331421, + "rewards/symbolic_reward_partial_score/std": 0.2275681048631668, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.046392560005188, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 19.699295043945312, + "step": 2785 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3417544662952423, + "epoch": 4.464743589743589, + "grad_norm": 46041.80078125, + "learning_rate": 1e-06, + "loss": 0.8061, + "step": 2786 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.35066042840480804, + "epoch": 4.466346153846154, + "grad_norm": 0.02314118668437004, + "learning_rate": 1e-06, + "loss": 0.0297, + "step": 2787 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.34087491035461426, + "epoch": 4.467948717948718, + "grad_norm": 5573.73779296875, + "learning_rate": 1e-06, + "loss": 0.1796, + "step": 2788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3409.0, + "completions/mean_length": 2916.076171875, + "completions/mean_terminated_length": 2195.56982421875, + "completions/min_length": 1118.0, + "completions/min_terminated_length": 1118.0, + "entropy": 0.34819819033145905, + "epoch": 4.469551282051282, + "frac_reward_zero_std": 0.28125, + "grad_norm": 49.569732666015625, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 1671554350.0, + "reward": 0.4041130542755127, + "reward_std": 0.06181896850466728, + "rewards/progression_diversity/mean": -0.005198919214308262, + "rewards/progression_diversity/std": 0.02956199087202549, + "rewards/symbolic_reward_accuracy/mean": 0.3046875, + "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, + "rewards/symbolic_reward_partial_score/mean": 0.7482584714889526, + "rewards/symbolic_reward_partial_score/std": 0.23627658188343048, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.037619948387146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 22.79637908935547, + "step": 2789 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3270893096923828, + "epoch": 4.471153846153846, + "grad_norm": 3314.6259765625, + "learning_rate": 1e-06, + "loss": 4.353, + "step": 2790 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.31063470244407654, + "epoch": 4.472756410256411, + "grad_norm": 5541.47216796875, + "learning_rate": 1e-06, + "loss": 0.4331, + "step": 2791 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3323570489883423, + "epoch": 4.4743589743589745, + "grad_norm": 0.013452508486807346, + "learning_rate": 1e-06, + "loss": 0.0806, + "step": 2792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4269.0, + "completions/mean_length": 2823.076171875, + "completions/mean_terminated_length": 2185.2412109375, + "completions/min_length": 677.0, + "completions/min_terminated_length": 677.0, + "entropy": 0.35660889744758606, + "epoch": 4.475961538461538, + "frac_reward_zero_std": 0.3125, + "grad_norm": 459.7804260253906, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 1673850565.0, + "reward": 0.3343041241168976, + "reward_std": 0.0574406236410141, + "rewards/progression_diversity/mean": -0.001719512976706028, + "rewards/progression_diversity/std": 0.017452143132686615, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.6970865726470947, + "rewards/symbolic_reward_partial_score/std": 0.22637033462524414, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0488998889923096, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 17.94207000732422, + "step": 2793 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.340995192527771, + "epoch": 4.477564102564102, + "grad_norm": 0.02110283449292183, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 2794 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.331098347902298, + "epoch": 4.479166666666667, + "grad_norm": 0.03017670288681984, + "learning_rate": 1e-06, + "loss": 0.0531, + "step": 2795 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.33193665742874146, + "epoch": 4.480769230769231, + "grad_norm": 0.07421034574508667, + "learning_rate": 1e-06, + "loss": 0.0802, + "step": 2796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4440.0, + "completions/mean_length": 2845.490234375, + "completions/mean_terminated_length": 2179.661865234375, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "entropy": 0.334924578666687, + "epoch": 4.482371794871795, + "frac_reward_zero_std": 0.375, + "grad_norm": 300.2858581542969, + "learning_rate": 1e-06, + "loss": 0.0756, + "num_tokens": 1676160880.0, + "reward": 0.3624279201030731, + "reward_std": 0.06554090976715088, + "rewards/progression_diversity/mean": -0.0008624562760815024, + "rewards/progression_diversity/std": 0.009423964656889439, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.7289550304412842, + "rewards/symbolic_reward_partial_score/std": 0.23089763522148132, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0452535152435303, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 16.53045654296875, + "step": 2797 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3460453897714615, + "epoch": 4.483974358974359, + "grad_norm": 0.013277917169034481, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 2798 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3399341404438019, + "epoch": 4.485576923076923, + "grad_norm": 0.018666263669729233, + "learning_rate": 1e-06, + "loss": 0.078, + "step": 2799 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.32549184560775757, + "epoch": 4.487179487179487, + "grad_norm": 0.0136836227029562, + "learning_rate": 1e-06, + "loss": 0.1256, + "step": 2800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3747.0, + "completions/mean_length": 2939.9921875, + "completions/mean_terminated_length": 2162.239501953125, + "completions/min_length": 605.0, + "completions/min_terminated_length": 605.0, + "entropy": 0.3398710787296295, + "epoch": 4.488782051282051, + "frac_reward_zero_std": 0.34375, + "grad_norm": 727.4561157226562, + "learning_rate": 1e-06, + "loss": 0.0565, + "num_tokens": 1678543564.0, + "reward": 0.2560884952545166, + "reward_std": 0.05889653041958809, + "rewards/progression_diversity/mean": -3.998234387836419e-05, + "rewards/progression_diversity/std": 0.0009046972263604403, + "rewards/symbolic_reward_accuracy/mean": 0.115234375, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.6348795294761658, + "rewards/symbolic_reward_partial_score/std": 0.23259754478931427, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0552442073822021, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 7.984652519226074, + "step": 2801 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.33726924657821655, + "epoch": 4.490384615384615, + "grad_norm": 0.024799533188343048, + "learning_rate": 1e-06, + "loss": 0.0782, + "step": 2802 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3415806442499161, + "epoch": 4.49198717948718, + "grad_norm": 0.01905086636543274, + "learning_rate": 1e-06, + "loss": 0.05, + "step": 2803 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.32189974188804626, + "epoch": 4.493589743589744, + "grad_norm": 0.01600506529211998, + "learning_rate": 1e-06, + "loss": 0.1482, + "step": 2804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4592.0, + "completions/mean_length": 2884.654296875, + "completions/mean_terminated_length": 2191.66943359375, + "completions/min_length": 787.0, + "completions/min_terminated_length": 787.0, + "entropy": 0.33240219950675964, + "epoch": 4.4951923076923075, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1222.6678466796875, + "learning_rate": 1e-06, + "loss": 0.065, + "num_tokens": 1680954971.0, + "reward": 0.2994934320449829, + "reward_std": 0.05192206799983978, + "rewards/progression_diversity/mean": -0.00036605086643248796, + "rewards/progression_diversity/std": 0.004469391889870167, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.7053548097610474, + "rewards/symbolic_reward_partial_score/std": 0.2314625382423401, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0502550601959229, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 13.556268692016602, + "step": 2805 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.32200056314468384, + "epoch": 4.496794871794872, + "grad_norm": 0.03439924120903015, + "learning_rate": 1e-06, + "loss": 0.1092, + "step": 2806 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.34106314182281494, + "epoch": 4.498397435897436, + "grad_norm": 0.014802143909037113, + "learning_rate": 1e-06, + "loss": 0.0375, + "step": 2807 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.32713207602500916, + "epoch": 4.5, + "grad_norm": 0.030605483800172806, + "learning_rate": 1e-06, + "loss": 0.1174, + "step": 2808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4037.0, + "completions/mean_length": 2988.58203125, + "completions/mean_terminated_length": 2154.842529296875, + "completions/min_length": 983.0, + "completions/min_terminated_length": 983.0, + "entropy": 0.32437823712825775, + "epoch": 4.501602564102564, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1198.33984375, + "learning_rate": 1e-06, + "loss": 0.0856, + "num_tokens": 1683353045.0, + "reward": 0.3443542718887329, + "reward_std": 0.09359234571456909, + "rewards/progression_diversity/mean": -0.0006082241889089346, + "rewards/progression_diversity/std": 0.005995499901473522, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.692138671875, + "rewards/symbolic_reward_partial_score/std": 0.2482290267944336, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045188069343567, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 16.6503849029541, + "step": 2809 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.33782175183296204, + "epoch": 4.503205128205128, + "grad_norm": 0.020003831014037132, + "learning_rate": 1e-06, + "loss": 0.0686, + "step": 2810 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3277200609445572, + "epoch": 4.5048076923076925, + "grad_norm": 0.016141105443239212, + "learning_rate": 1e-06, + "loss": 0.0929, + "step": 2811 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.32373303174972534, + "epoch": 4.506410256410256, + "grad_norm": 0.03574444353580475, + "learning_rate": 1e-06, + "loss": 0.1154, + "step": 2812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4322.0, + "completions/mean_length": 2614.474609375, + "completions/mean_terminated_length": 2112.7509765625, + "completions/min_length": 655.0, + "completions/min_terminated_length": 655.0, + "entropy": 0.3422088921070099, + "epoch": 4.50801282051282, + "frac_reward_zero_std": 0.375, + "grad_norm": 256.3056335449219, + "learning_rate": 1e-06, + "loss": 0.057, + "num_tokens": 1685530120.0, + "reward": 0.3261813223361969, + "reward_std": 0.06060642749071121, + "rewards/progression_diversity/mean": -0.0005208561196923256, + "rewards/progression_diversity/std": 0.005044504068791866, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.7187988758087158, + "rewards/symbolic_reward_partial_score/std": 0.2249433696269989, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0556640625, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 14.019671440124512, + "step": 2813 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.32429981231689453, + "epoch": 4.509615384615385, + "grad_norm": 0.016235308721661568, + "learning_rate": 1e-06, + "loss": 0.1237, + "step": 2814 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.34095147252082825, + "epoch": 4.511217948717949, + "grad_norm": 0.026136431843042374, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 2815 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.34518367052078247, + "epoch": 4.512820512820513, + "grad_norm": 0.012866949662566185, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 2816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3575.0, + "completions/mean_length": 2740.462890625, + "completions/mean_terminated_length": 2069.46923828125, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "entropy": 0.33681435883045197, + "epoch": 4.514423076923077, + "frac_reward_zero_std": 0.3125, + "grad_norm": 524.9415283203125, + "learning_rate": 1e-06, + "loss": 0.0291, + "num_tokens": 1687806261.0, + "reward": 0.4213888347148895, + "reward_std": 0.08116651326417923, + "rewards/progression_diversity/mean": -0.0007689363555982709, + "rewards/progression_diversity/std": 0.008490425534546375, + "rewards/symbolic_reward_accuracy/mean": 0.337890625, + "rewards/symbolic_reward_accuracy/std": 0.4734536409378052, + "rewards/symbolic_reward_partial_score/mean": 0.7386393547058105, + "rewards/symbolic_reward_partial_score/std": 0.24396677315235138, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0432395935058594, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 18.25098419189453, + "step": 2817 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3194080591201782, + "epoch": 4.516025641025641, + "grad_norm": 0.022730786353349686, + "learning_rate": 1e-06, + "loss": 0.1341, + "step": 2818 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3404367119073868, + "epoch": 4.517628205128205, + "grad_norm": 0.017860587686300278, + "learning_rate": 1e-06, + "loss": 0.0195, + "step": 2819 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.324150949716568, + "epoch": 4.519230769230769, + "grad_norm": 0.019054269418120384, + "learning_rate": 1e-06, + "loss": 0.0698, + "step": 2820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3389.0, + "completions/mean_length": 3260.208984375, + "completions/mean_terminated_length": 2026.348388671875, + "completions/min_length": 650.0, + "completions/min_terminated_length": 650.0, + "entropy": 0.3134499490261078, + "epoch": 4.520833333333333, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1172.327392578125, + "learning_rate": 1e-06, + "loss": 0.119, + "num_tokens": 1690392976.0, + "reward": 0.27747392654418945, + "reward_std": 0.07828624546527863, + "rewards/progression_diversity/mean": -0.0016331763472408056, + "rewards/progression_diversity/std": 0.009813708253204823, + "rewards/symbolic_reward_accuracy/mean": 0.140625, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.66455078125, + "rewards/symbolic_reward_partial_score/std": 0.2490902692079544, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0194971561431885, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 25.523792266845703, + "step": 2821 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.28017091751098633, + "epoch": 4.522435897435898, + "grad_norm": 0.024804720655083656, + "learning_rate": 1e-06, + "loss": 0.1724, + "step": 2822 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.32716619968414307, + "epoch": 4.524038461538462, + "grad_norm": 0.02028745971620083, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 2823 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.293626606464386, + "epoch": 4.5256410256410255, + "grad_norm": 0.01859315298497677, + "learning_rate": 1e-06, + "loss": 0.152, + "step": 2824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4031.0, + "completions/mean_length": 2742.6484375, + "completions/mean_terminated_length": 2012.8641357421875, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "entropy": 0.33004169166088104, + "epoch": 4.527243589743589, + "frac_reward_zero_std": 0.375, + "grad_norm": 179.26219177246094, + "learning_rate": 1e-06, + "loss": 0.0294, + "num_tokens": 1692678172.0, + "reward": 0.3905848264694214, + "reward_std": 0.056361980736255646, + "rewards/progression_diversity/mean": -0.00011124266166007146, + "rewards/progression_diversity/std": 0.002228989265859127, + "rewards/symbolic_reward_accuracy/mean": 0.298828125, + "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, + "rewards/symbolic_reward_partial_score/mean": 0.716015636920929, + "rewards/symbolic_reward_partial_score/std": 0.2630784511566162, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041427493095398, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 19.09334945678711, + "step": 2825 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.310644194483757, + "epoch": 4.528846153846154, + "grad_norm": 0.01855999417603016, + "learning_rate": 1e-06, + "loss": 0.1514, + "step": 2826 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.33532649278640747, + "epoch": 4.530448717948718, + "grad_norm": 0.017645422369241714, + "learning_rate": 1e-06, + "loss": 0.0642, + "step": 2827 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3198052793741226, + "epoch": 4.532051282051282, + "grad_norm": 0.01399680133908987, + "learning_rate": 1e-06, + "loss": 0.0771, + "step": 2828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4372.0, + "completions/mean_length": 3640.142578125, + "completions/mean_terminated_length": 2012.0726318359375, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "entropy": 0.3003094792366028, + "epoch": 4.533653846153846, + "frac_reward_zero_std": 0.21875, + "grad_norm": 824.9033813476562, + "learning_rate": 1e-06, + "loss": 0.0933, + "num_tokens": 1695515621.0, + "reward": 0.29925772547721863, + "reward_std": 0.10430952906608582, + "rewards/progression_diversity/mean": -1.0345164810132701e-05, + "rewards/progression_diversity/std": 0.00019557196355890483, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.6570312976837158, + "rewards/symbolic_reward_partial_score/std": 0.28810763359069824, + "rewards/tag_count_reward/mean": -0.103515625, + "rewards/tag_count_reward/std": 0.30492907762527466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0081380605697632, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 32.27723693847656, + "step": 2829 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2708180397748947, + "epoch": 4.535256410256411, + "grad_norm": 0.02845774032175541, + "learning_rate": 1e-06, + "loss": 0.1828, + "step": 2830 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.28906436264514923, + "epoch": 4.5368589743589745, + "grad_norm": 0.015066894702613354, + "learning_rate": 1e-06, + "loss": 0.1517, + "step": 2831 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2919054627418518, + "epoch": 4.538461538461538, + "grad_norm": 0.027522152289748192, + "learning_rate": 1e-06, + "loss": 0.1443, + "step": 2832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4253.0, + "completions/mean_length": 3101.431640625, + "completions/mean_terminated_length": 1975.790283203125, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "entropy": 0.32231228053569794, + "epoch": 4.540064102564102, + "frac_reward_zero_std": 0.3125, + "grad_norm": 864.8489990234375, + "learning_rate": 1e-06, + "loss": 0.0609, + "num_tokens": 1697946786.0, + "reward": 0.3166502118110657, + "reward_std": 0.07383254915475845, + "rewards/progression_diversity/mean": -1.783898551366292e-05, + "rewards/progression_diversity/std": 0.0004036501923110336, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.6961263418197632, + "rewards/symbolic_reward_partial_score/std": 0.26130229234695435, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0254395008087158, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 25.37073516845703, + "step": 2833 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.29585976898670197, + "epoch": 4.541666666666667, + "grad_norm": 0.015048340894281864, + "learning_rate": 1e-06, + "loss": 0.1864, + "step": 2834 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.31169450283050537, + "epoch": 4.543269230769231, + "grad_norm": 0.013053800910711288, + "learning_rate": 1e-06, + "loss": 0.1106, + "step": 2835 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3154265433549881, + "epoch": 4.544871794871795, + "grad_norm": 0.02868885174393654, + "learning_rate": 1e-06, + "loss": 0.0882, + "step": 2836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4116.0, + "completions/mean_length": 2923.740234375, + "completions/mean_terminated_length": 1936.090087890625, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "entropy": 0.2929193377494812, + "epoch": 4.546474358974359, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1734.2320556640625, + "learning_rate": 1e-06, + "loss": 0.1872, + "num_tokens": 1700301149.0, + "reward": 0.2965981960296631, + "reward_std": 0.07857708632946014, + "rewards/progression_diversity/mean": -0.0008270339458249509, + "rewards/progression_diversity/std": 0.01284003909677267, + "rewards/symbolic_reward_accuracy/mean": 0.169921875, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.669677734375, + "rewards/symbolic_reward_partial_score/std": 0.2715960741043091, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0229657888412476, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 27.50617790222168, + "step": 2837 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3156014531850815, + "epoch": 4.548076923076923, + "grad_norm": 0.010393503122031689, + "learning_rate": 1e-06, + "loss": 0.1186, + "step": 2838 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.32449236512184143, + "epoch": 4.549679487179487, + "grad_norm": 0.015459039248526096, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 2839 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.31201881170272827, + "epoch": 4.551282051282051, + "grad_norm": 0.014963453635573387, + "learning_rate": 1e-06, + "loss": 0.0831, + "step": 2840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3925.0, + "completions/mean_length": 3557.302734375, + "completions/mean_terminated_length": 1950.44189453125, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.30540113151073456, + "epoch": 4.552884615384615, + "frac_reward_zero_std": 0.09375, + "grad_norm": 626.2611083984375, + "learning_rate": 1e-06, + "loss": 0.077, + "num_tokens": 1703068392.0, + "reward": 0.24340294301509857, + "reward_std": 0.11287408322095871, + "rewards/progression_diversity/mean": -4.04381935368292e-05, + "rewards/progression_diversity/std": 0.0005320300115272403, + "rewards/symbolic_reward_accuracy/mean": 0.1171875, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.6095215082168579, + "rewards/symbolic_reward_partial_score/std": 0.27623260021209717, + "rewards/tag_count_reward/mean": -0.09765625, + "rewards/tag_count_reward/std": 0.29713961482048035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9910718202590942, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 40.45024108886719, + "step": 2841 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2842327356338501, + "epoch": 4.55448717948718, + "grad_norm": 0.02323756366968155, + "learning_rate": 1e-06, + "loss": 0.1819, + "step": 2842 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.28665095567703247, + "epoch": 4.556089743589744, + "grad_norm": 0.018670253455638885, + "learning_rate": 1e-06, + "loss": 0.1907, + "step": 2843 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.28130021691322327, + "epoch": 4.5576923076923075, + "grad_norm": 0.014929386787116528, + "learning_rate": 1e-06, + "loss": 0.1567, + "step": 2844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4265.0, + "completions/mean_length": 2879.638671875, + "completions/mean_terminated_length": 1888.7525634765625, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "entropy": 0.33259910345077515, + "epoch": 4.559294871794872, + "frac_reward_zero_std": 0.25, + "grad_norm": 539.2830810546875, + "learning_rate": 1e-06, + "loss": 0.0357, + "num_tokens": 1705342479.0, + "reward": 0.309814453125, + "reward_std": 0.08594641089439392, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.6902669668197632, + "rewards/symbolic_reward_partial_score/std": 0.2536517083644867, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0254346132278442, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 25.892318725585938, + "step": 2845 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3099818527698517, + "epoch": 4.560897435897436, + "grad_norm": 0.01687266118824482, + "learning_rate": 1e-06, + "loss": 0.1278, + "step": 2846 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3287712037563324, + "epoch": 4.5625, + "grad_norm": 0.017019418999552727, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 2847 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3039703518152237, + "epoch": 4.564102564102564, + "grad_norm": 0.02699456736445427, + "learning_rate": 1e-06, + "loss": 0.1679, + "step": 2848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3132.0, + "completions/mean_length": 3338.453125, + "completions/mean_terminated_length": 1957.8228759765625, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "entropy": 0.3064081221818924, + "epoch": 4.565705128205128, + "frac_reward_zero_std": 0.34375, + "grad_norm": 517.185302734375, + "learning_rate": 1e-06, + "loss": 0.109, + "num_tokens": 1707884951.0, + "reward": 0.35695183277130127, + "reward_std": 0.07652982324361801, + "rewards/progression_diversity/mean": -0.00013049774861428887, + "rewards/progression_diversity/std": 0.002952827140688896, + "rewards/symbolic_reward_accuracy/mean": 0.2421875, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.7328125238418579, + "rewards/symbolic_reward_partial_score/std": 0.2740279734134674, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0161539316177368, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 28.80113983154297, + "step": 2849 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3108035773038864, + "epoch": 4.5673076923076925, + "grad_norm": 4669.63525390625, + "learning_rate": 1e-06, + "loss": 0.3536, + "step": 2850 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2923634201288223, + "epoch": 4.568910256410256, + "grad_norm": 0.014963356778025627, + "learning_rate": 1e-06, + "loss": 0.1355, + "step": 2851 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3147512972354889, + "epoch": 4.57051282051282, + "grad_norm": 0.009707149118185043, + "learning_rate": 1e-06, + "loss": 0.1122, + "step": 2852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.134765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4114.0, + "completions/mean_length": 3972.37890625, + "completions/mean_terminated_length": 2039.19189453125, + "completions/min_length": 601.0, + "completions/min_terminated_length": 601.0, + "entropy": 0.2673351615667343, + "epoch": 4.572115384615385, + "frac_reward_zero_std": 0.15625, + "grad_norm": 499.8143005371094, + "learning_rate": 1e-06, + "loss": 0.1695, + "num_tokens": 1710790361.0, + "reward": 0.23631691932678223, + "reward_std": 0.09914463013410568, + "rewards/progression_diversity/mean": -0.0001452151482226327, + "rewards/progression_diversity/std": 0.00327769061550498, + "rewards/symbolic_reward_accuracy/mean": 0.103515625, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.6158528327941895, + "rewards/symbolic_reward_partial_score/std": 0.28579404950141907, + "rewards/tag_count_reward/mean": -0.10546875, + "rewards/tag_count_reward/std": 0.3074568510055542, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9861880540847778, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 42.465789794921875, + "step": 2853 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.301104798913002, + "epoch": 4.573717948717949, + "grad_norm": 0.01647542603313923, + "learning_rate": 1e-06, + "loss": 0.1265, + "step": 2854 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2893615812063217, + "epoch": 4.575320512820513, + "grad_norm": 0.02017975226044655, + "learning_rate": 1e-06, + "loss": 0.1213, + "step": 2855 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.26380933821201324, + "epoch": 4.576923076923077, + "grad_norm": 0.014531499706208706, + "learning_rate": 1e-06, + "loss": 0.2342, + "step": 2856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.14453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3445.0, + "completions/mean_length": 4089.650390625, + "completions/mean_terminated_length": 2012.522705078125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "entropy": 0.27838708460330963, + "epoch": 4.578525641025641, + "frac_reward_zero_std": 0.28125, + "grad_norm": 3233.53173828125, + "learning_rate": 1e-06, + "loss": 0.1156, + "num_tokens": 1713713190.0, + "reward": 0.2200063169002533, + "reward_std": 0.08358844369649887, + "rewards/progression_diversity/mean": -0.0008350086864084005, + "rewards/progression_diversity/std": 0.0046890368685126305, + "rewards/symbolic_reward_accuracy/mean": 0.087890625, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.5986166000366211, + "rewards/symbolic_reward_partial_score/std": 0.27839624881744385, + "rewards/tag_count_reward/mean": -0.123046875, + "rewards/tag_count_reward/std": 0.32881227135658264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900908470153809, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 42.56203842163086, + "step": 2857 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.28409095108509064, + "epoch": 4.580128205128205, + "grad_norm": 359.2823486328125, + "learning_rate": 1e-06, + "loss": 0.1902, + "step": 2858 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2833191752433777, + "epoch": 4.581730769230769, + "grad_norm": 0.015096834860742092, + "learning_rate": 1e-06, + "loss": 0.1434, + "step": 2859 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2946213483810425, + "epoch": 4.583333333333333, + "grad_norm": 0.01923026703298092, + "learning_rate": 1e-06, + "loss": 0.16, + "step": 2860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3878.0, + "completions/mean_length": 4379.064453125, + "completions/mean_terminated_length": 2022.95556640625, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "entropy": 0.282021164894104, + "epoch": 4.584935897435898, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1836.3408203125, + "learning_rate": 1e-06, + "loss": 0.1013, + "num_tokens": 1716882167.0, + "reward": 0.23553681373596191, + "reward_std": 0.10817793756723404, + "rewards/progression_diversity/mean": -0.0044245729222893715, + "rewards/progression_diversity/std": 0.01807117834687233, + "rewards/symbolic_reward_accuracy/mean": 0.103515625, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.6192545890808105, + "rewards/symbolic_reward_partial_score/std": 0.2875533699989319, + "rewards/tag_count_reward/mean": -0.123046875, + "rewards/tag_count_reward/std": 0.32881227135658264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978479266166687, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 46.914520263671875, + "step": 2861 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2763599753379822, + "epoch": 4.586538461538462, + "grad_norm": 0.9805912971496582, + "learning_rate": 1e-06, + "loss": 0.1614, + "step": 2862 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.27411825954914093, + "epoch": 4.5881410256410255, + "grad_norm": 0.016598911955952644, + "learning_rate": 1e-06, + "loss": 0.163, + "step": 2863 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.26957446336746216, + "epoch": 4.589743589743589, + "grad_norm": 0.02522032894194126, + "learning_rate": 1e-06, + "loss": 0.2078, + "step": 2864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3985.0, + "completions/mean_length": 3941.201171875, + "completions/mean_terminated_length": 1970.622314453125, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.2950494736433029, + "epoch": 4.591346153846154, + "frac_reward_zero_std": 0.25, + "grad_norm": 2163.189697265625, + "learning_rate": 1e-06, + "loss": 0.123, + "num_tokens": 1719742142.0, + "reward": 0.42619481682777405, + "reward_std": 0.12798863649368286, + "rewards/progression_diversity/mean": -0.001614722772501409, + "rewards/progression_diversity/std": 0.008756079711019993, + "rewards/symbolic_reward_accuracy/mean": 0.369140625, + "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, + "rewards/symbolic_reward_partial_score/mean": 0.7169270515441895, + "rewards/symbolic_reward_partial_score/std": 0.3149210810661316, + "rewards/tag_count_reward/mean": -0.103515625, + "rewards/tag_count_reward/std": 0.30492907762527466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912563562393188, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 41.91343307495117, + "step": 2865 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2725277543067932, + "epoch": 4.592948717948718, + "grad_norm": 5.392195224761963, + "learning_rate": 1e-06, + "loss": 0.1291, + "step": 2866 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.29304346442222595, + "epoch": 4.594551282051282, + "grad_norm": 0.035934969782829285, + "learning_rate": 1e-06, + "loss": 0.1513, + "step": 2867 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3078482747077942, + "epoch": 4.596153846153846, + "grad_norm": 0.01671292632818222, + "learning_rate": 1e-06, + "loss": 0.1524, + "step": 2868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3282.0, + "completions/mean_length": 3543.140625, + "completions/mean_terminated_length": 1934.505615234375, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "entropy": 0.3241666555404663, + "epoch": 4.597756410256411, + "frac_reward_zero_std": 0.09375, + "grad_norm": 679.676513671875, + "learning_rate": 1e-06, + "loss": 0.1035, + "num_tokens": 1722401942.0, + "reward": 0.302420049905777, + "reward_std": 0.12250546365976334, + "rewards/progression_diversity/mean": -0.0011605408508330584, + "rewards/progression_diversity/std": 0.009744822047650814, + "rewards/symbolic_reward_accuracy/mean": 0.189453125, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.656542956829071, + "rewards/symbolic_reward_partial_score/std": 0.2855222225189209, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899616241455078, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 41.70113754272461, + "step": 2869 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2940093129873276, + "epoch": 4.5993589743589745, + "grad_norm": 135809.875, + "learning_rate": 1e-06, + "loss": 7.7034, + "step": 2870 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3006378263235092, + "epoch": 4.600961538461538, + "grad_norm": 0.014255443587899208, + "learning_rate": 1e-06, + "loss": 0.122, + "step": 2871 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.30779320001602173, + "epoch": 4.602564102564102, + "grad_norm": 0.01406958419829607, + "learning_rate": 1e-06, + "loss": 0.1774, + "step": 2872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.138671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5307.0, + "completions/mean_length": 3947.236328125, + "completions/mean_terminated_length": 1944.945556640625, + "completions/min_length": 608.0, + "completions/min_terminated_length": 608.0, + "entropy": 0.29377947747707367, + "epoch": 4.604166666666667, + "frac_reward_zero_std": 0.125, + "grad_norm": 797.7410888671875, + "learning_rate": 1e-06, + "loss": 0.1212, + "num_tokens": 1725344047.0, + "reward": 0.32524576783180237, + "reward_std": 0.12811732292175293, + "rewards/progression_diversity/mean": -0.007164230570197105, + "rewards/progression_diversity/std": 0.03267419710755348, + "rewards/symbolic_reward_accuracy/mean": 0.2265625, + "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, + "rewards/symbolic_reward_partial_score/mean": 0.6670736074447632, + "rewards/symbolic_reward_partial_score/std": 0.28852909803390503, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9761419892311096, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 46.77880859375, + "step": 2873 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.28681907057762146, + "epoch": 4.605769230769231, + "grad_norm": 0.03097737766802311, + "learning_rate": 1e-06, + "loss": 0.168, + "step": 2874 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2869716286659241, + "epoch": 4.607371794871795, + "grad_norm": 0.017294280230998993, + "learning_rate": 1e-06, + "loss": 0.1486, + "step": 2875 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.26380398869514465, + "epoch": 4.608974358974359, + "grad_norm": 0.011689919047057629, + "learning_rate": 1e-06, + "loss": 0.204, + "step": 2876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4096.0, + "completions/mean_length": 3391.904296875, + "completions/mean_terminated_length": 1954.6009521484375, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "entropy": 0.31273353099823, + "epoch": 4.610576923076923, + "frac_reward_zero_std": 0.21875, + "grad_norm": 2286.46435546875, + "learning_rate": 1e-06, + "loss": 0.144, + "num_tokens": 1727923182.0, + "reward": 0.346618115901947, + "reward_std": 0.1120685264468193, + "rewards/progression_diversity/mean": -0.011530693620443344, + "rewards/progression_diversity/std": 0.04677894711494446, + "rewards/symbolic_reward_accuracy/mean": 0.23828125, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.7052571773529053, + "rewards/symbolic_reward_partial_score/std": 0.2821883261203766, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.005671739578247, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 34.7049560546875, + "step": 2877 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3126651644706726, + "epoch": 4.612179487179487, + "grad_norm": 0.12625998258590698, + "learning_rate": 1e-06, + "loss": 0.0953, + "step": 2878 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30597206950187683, + "epoch": 4.613782051282051, + "grad_norm": 0.013404784724116325, + "learning_rate": 1e-06, + "loss": 0.1176, + "step": 2879 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2983405888080597, + "epoch": 4.615384615384615, + "grad_norm": 0.012157008983194828, + "learning_rate": 1e-06, + "loss": 0.1615, + "step": 2880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3449.0, + "completions/mean_length": 3446.08203125, + "completions/mean_terminated_length": 1889.002197265625, + "completions/min_length": 558.0, + "completions/min_terminated_length": 558.0, + "entropy": 0.2969917505979538, + "epoch": 4.61698717948718, + "frac_reward_zero_std": 0.125, + "grad_norm": 808.1333618164062, + "learning_rate": 1e-06, + "loss": 0.1351, + "num_tokens": 1730537336.0, + "reward": 0.39082223176956177, + "reward_std": 0.11836196482181549, + "rewards/progression_diversity/mean": -0.011041311547160149, + "rewards/progression_diversity/std": 0.04268914833664894, + "rewards/symbolic_reward_accuracy/mean": 0.310546875, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.7113118767738342, + "rewards/symbolic_reward_partial_score/std": 0.3017731308937073, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0123034715652466, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 34.338539123535156, + "step": 2881 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3034718632698059, + "epoch": 4.618589743589744, + "grad_norm": 0.022329121828079224, + "learning_rate": 1e-06, + "loss": 0.1786, + "step": 2882 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3132249712944031, + "epoch": 4.6201923076923075, + "grad_norm": 0.021133504807949066, + "learning_rate": 1e-06, + "loss": 0.1108, + "step": 2883 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.31891536712646484, + "epoch": 4.621794871794872, + "grad_norm": 0.016432074829936028, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 2884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3885.0, + "completions/mean_length": 2795.095703125, + "completions/mean_terminated_length": 1919.3035888671875, + "completions/min_length": 668.0, + "completions/min_terminated_length": 668.0, + "entropy": 0.3364175856113434, + "epoch": 4.623397435897436, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1410.4578857421875, + "learning_rate": 1e-06, + "loss": 0.0957, + "num_tokens": 1732852201.0, + "reward": 0.34519851207733154, + "reward_std": 0.0896041989326477, + "rewards/progression_diversity/mean": -0.0035866908729076385, + "rewards/progression_diversity/std": 0.021069582551717758, + "rewards/symbolic_reward_accuracy/mean": 0.22265625, + "rewards/symbolic_reward_accuracy/std": 0.41643625497817993, + "rewards/symbolic_reward_partial_score/mean": 0.7230468988418579, + "rewards/symbolic_reward_partial_score/std": 0.24687045812606812, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.028261423110962, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 28.295589447021484, + "step": 2885 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.32780230045318604, + "epoch": 4.625, + "grad_norm": 0.020152615383267403, + "learning_rate": 1e-06, + "loss": 0.0954, + "step": 2886 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3289777636528015, + "epoch": 4.626602564102564, + "grad_norm": 0.012928970158100128, + "learning_rate": 1e-06, + "loss": 0.0939, + "step": 2887 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.33029577136039734, + "epoch": 4.628205128205128, + "grad_norm": 0.013741690665483475, + "learning_rate": 1e-06, + "loss": 0.114, + "step": 2888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4060.0, + "completions/mean_length": 2811.73828125, + "completions/mean_terminated_length": 1966.9918212890625, + "completions/min_length": 586.0, + "completions/min_terminated_length": 586.0, + "entropy": 0.3604893386363983, + "epoch": 4.6298076923076925, + "frac_reward_zero_std": 0.25, + "grad_norm": 329.1974792480469, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 1735088787.0, + "reward": 0.31703460216522217, + "reward_std": 0.06931142508983612, + "rewards/progression_diversity/mean": -0.0021051131188869476, + "rewards/progression_diversity/std": 0.014493024908006191, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.7118000984191895, + "rewards/symbolic_reward_partial_score/std": 0.24182109534740448, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0364668369293213, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 24.83597183227539, + "step": 2889 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3358510881662369, + "epoch": 4.631410256410256, + "grad_norm": 0.026600396260619164, + "learning_rate": 1e-06, + "loss": 0.0782, + "step": 2890 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.34051838517189026, + "epoch": 4.63301282051282, + "grad_norm": 0.01583019644021988, + "learning_rate": 1e-06, + "loss": 0.077, + "step": 2891 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3082561045885086, + "epoch": 4.634615384615385, + "grad_norm": 0.013296614401042461, + "learning_rate": 1e-06, + "loss": 0.1433, + "step": 2892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3677.0, + "completions/mean_length": 3038.3359375, + "completions/mean_terminated_length": 2059.09423828125, + "completions/min_length": 736.0, + "completions/min_terminated_length": 736.0, + "entropy": 0.33847953379154205, + "epoch": 4.636217948717949, + "frac_reward_zero_std": 0.34375, + "grad_norm": 867.29736328125, + "learning_rate": 1e-06, + "loss": 0.0601, + "num_tokens": 1737559423.0, + "reward": 0.36477047204971313, + "reward_std": 0.08899857103824615, + "rewards/progression_diversity/mean": -0.001956797670572996, + "rewards/progression_diversity/std": 0.012748402543365955, + "rewards/symbolic_reward_accuracy/mean": 0.26171875, + "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, + "rewards/symbolic_reward_partial_score/mean": 0.7127115726470947, + "rewards/symbolic_reward_partial_score/std": 0.27312228083610535, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.024355411529541, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 30.130250930786133, + "step": 2893 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3239692151546478, + "epoch": 4.637820512820513, + "grad_norm": 0.012062937021255493, + "learning_rate": 1e-06, + "loss": 0.11, + "step": 2894 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3260853737592697, + "epoch": 4.639423076923077, + "grad_norm": 0.11582206934690475, + "learning_rate": 1e-06, + "loss": 0.1047, + "step": 2895 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31543244421482086, + "epoch": 4.641025641025641, + "grad_norm": 0.020565569400787354, + "learning_rate": 1e-06, + "loss": 0.1456, + "step": 2896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3929.0, + "completions/mean_length": 2900.99609375, + "completions/mean_terminated_length": 2120.987548828125, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "entropy": 0.34089401364326477, + "epoch": 4.642628205128205, + "frac_reward_zero_std": 0.21875, + "grad_norm": 597.7639770507812, + "learning_rate": 1e-06, + "loss": 0.0498, + "num_tokens": 1739987869.0, + "reward": 0.27188873291015625, + "reward_std": 0.067967489361763, + "rewards/progression_diversity/mean": -9.193217556457967e-05, + "rewards/progression_diversity/std": 0.0012361678527668118, + "rewards/symbolic_reward_accuracy/mean": 0.115234375, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.689501941204071, + "rewards/symbolic_reward_partial_score/std": 0.22588656842708588, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0352983474731445, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 25.13549041748047, + "step": 2897 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3242097795009613, + "epoch": 4.644230769230769, + "grad_norm": 0.021813800558447838, + "learning_rate": 1e-06, + "loss": 0.1245, + "step": 2898 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.34044331312179565, + "epoch": 4.645833333333333, + "grad_norm": 0.016379984095692635, + "learning_rate": 1e-06, + "loss": 0.0982, + "step": 2899 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.33183909952640533, + "epoch": 4.647435897435898, + "grad_norm": 0.04149186238646507, + "learning_rate": 1e-06, + "loss": 3.0971, + "step": 2900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4292.0, + "completions/mean_length": 2847.33203125, + "completions/mean_terminated_length": 2064.218994140625, + "completions/min_length": 771.0, + "completions/min_terminated_length": 771.0, + "entropy": 0.32430991530418396, + "epoch": 4.649038461538462, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1954.2508544921875, + "learning_rate": 1e-06, + "loss": 0.1152, + "num_tokens": 1742326855.0, + "reward": 0.3111421465873718, + "reward_std": 0.07776374369859695, + "rewards/progression_diversity/mean": -4.3267915316391736e-05, + "rewards/progression_diversity/std": 0.0009790410986170173, + "rewards/symbolic_reward_accuracy/mean": 0.1875, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.6751627326011658, + "rewards/symbolic_reward_partial_score/std": 0.2522996664047241, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0431500673294067, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 21.82623863220215, + "step": 2901 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.3435054123401642, + "epoch": 4.6506410256410255, + "grad_norm": 0.013913768343627453, + "learning_rate": 1e-06, + "loss": 0.0597, + "step": 2902 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.34366682171821594, + "epoch": 4.652243589743589, + "grad_norm": 0.021470922976732254, + "learning_rate": 1e-06, + "loss": 0.0525, + "step": 2903 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3440418541431427, + "epoch": 4.653846153846154, + "grad_norm": 201.25079345703125, + "learning_rate": 1e-06, + "loss": 0.0948, + "step": 2904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3945.0, + "completions/mean_length": 2559.75390625, + "completions/mean_terminated_length": 2026.9736328125, + "completions/min_length": 664.0, + "completions/min_terminated_length": 664.0, + "entropy": 0.34741640090942383, + "epoch": 4.655448717948718, + "frac_reward_zero_std": 0.3125, + "grad_norm": 587.72802734375, + "learning_rate": 1e-06, + "loss": 0.0481, + "num_tokens": 1744472953.0, + "reward": 0.3951566517353058, + "reward_std": 0.06519210338592529, + "rewards/progression_diversity/mean": -0.0004501467919908464, + "rewards/progression_diversity/std": 0.005880733020603657, + "rewards/symbolic_reward_accuracy/mean": 0.283203125, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.7605631351470947, + "rewards/symbolic_reward_partial_score/std": 0.23610848188400269, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0574297904968262, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 15.714860916137695, + "step": 2905 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3476281762123108, + "epoch": 4.657051282051282, + "grad_norm": 0.025904379785060883, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 2906 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3408808410167694, + "epoch": 4.658653846153846, + "grad_norm": 0.02337014675140381, + "learning_rate": 1e-06, + "loss": 0.0792, + "step": 2907 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.33761781454086304, + "epoch": 4.660256410256411, + "grad_norm": 0.013979962095618248, + "learning_rate": 1e-06, + "loss": 0.0681, + "step": 2908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3958.0, + "completions/mean_length": 2804.318359375, + "completions/mean_terminated_length": 2018.716796875, + "completions/min_length": 513.0, + "completions/min_terminated_length": 513.0, + "entropy": 0.352565199136734, + "epoch": 4.6618589743589745, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.027550842612981796, + "learning_rate": 1e-06, + "loss": 0.0123, + "num_tokens": 1746766092.0, + "reward": 0.33431804180145264, + "reward_std": 0.07552418112754822, + "rewards/progression_diversity/mean": -0.0008154752431437373, + "rewards/progression_diversity/std": 0.007044443394988775, + "rewards/symbolic_reward_accuracy/mean": 0.201171875, + "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, + "rewards/symbolic_reward_partial_score/mean": 0.7244465947151184, + "rewards/symbolic_reward_partial_score/std": 0.24222518503665924, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0389173030853271, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 21.87183380126953, + "step": 2909 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3289457857608795, + "epoch": 4.663461538461538, + "grad_norm": 0.018005970865488052, + "learning_rate": 1e-06, + "loss": 0.1036, + "step": 2910 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31602902710437775, + "epoch": 4.665064102564102, + "grad_norm": 0.015265545807778835, + "learning_rate": 1e-06, + "loss": 0.0927, + "step": 2911 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.32312436401844025, + "epoch": 4.666666666666667, + "grad_norm": 0.010948474518954754, + "learning_rate": 1e-06, + "loss": 0.1255, + "step": 2912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3403.0, + "completions/mean_length": 2489.4609375, + "completions/mean_terminated_length": 1953.9715576171875, + "completions/min_length": 735.0, + "completions/min_terminated_length": 735.0, + "entropy": 0.32907360792160034, + "epoch": 4.668269230769231, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1542.9832763671875, + "learning_rate": 1e-06, + "loss": 0.0546, + "num_tokens": 1748830296.0, + "reward": 0.3275552988052368, + "reward_std": 0.04779002070426941, + "rewards/progression_diversity/mean": -0.000332732277456671, + "rewards/progression_diversity/std": 0.004427206236869097, + "rewards/symbolic_reward_accuracy/mean": 0.1796875, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.7376953363418579, + "rewards/symbolic_reward_partial_score/std": 0.21765783429145813, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0540077686309814, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 14.962639808654785, + "step": 2913 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3347413241863251, + "epoch": 4.669871794871795, + "grad_norm": 0.02077837660908699, + "learning_rate": 1e-06, + "loss": 0.0577, + "step": 2914 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.35084839165210724, + "epoch": 4.671474358974359, + "grad_norm": 0.1329037845134735, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 2915 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3418993204832077, + "epoch": 4.673076923076923, + "grad_norm": 0.035508181899785995, + "learning_rate": 1e-06, + "loss": 0.0666, + "step": 2916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3737.0, + "completions/mean_length": 2839.4609375, + "completions/mean_terminated_length": 2026.227783203125, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "entropy": 0.3173777014017105, + "epoch": 4.674679487179487, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1100.101806640625, + "learning_rate": 1e-06, + "loss": 0.1087, + "num_tokens": 1751308340.0, + "reward": 0.3337549567222595, + "reward_std": 0.0646740198135376, + "rewards/progression_diversity/mean": -0.000972743786405772, + "rewards/progression_diversity/std": 0.006808849982917309, + "rewards/symbolic_reward_accuracy/mean": 0.21875, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.690673828125, + "rewards/symbolic_reward_partial_score/std": 0.2509380280971527, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.034561276435852, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 23.243541717529297, + "step": 2917 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3356782793998718, + "epoch": 4.676282051282051, + "grad_norm": 0.01395330298691988, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 2918 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30980144441127777, + "epoch": 4.677884615384615, + "grad_norm": 0.015020109713077545, + "learning_rate": 1e-06, + "loss": 0.1587, + "step": 2919 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3269190788269043, + "epoch": 4.67948717948718, + "grad_norm": 0.10749267786741257, + "learning_rate": 1e-06, + "loss": 0.0676, + "step": 2920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3986.0, + "completions/mean_length": 2943.259765625, + "completions/mean_terminated_length": 2077.0166015625, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "entropy": 0.3407384008169174, + "epoch": 4.681089743589744, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1545.7191162109375, + "learning_rate": 1e-06, + "loss": 0.0606, + "num_tokens": 1753724633.0, + "reward": 0.3297162652015686, + "reward_std": 0.0669044628739357, + "rewards/progression_diversity/mean": -0.0015185039956122637, + "rewards/progression_diversity/std": 0.012093695811927319, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.6928547620773315, + "rewards/symbolic_reward_partial_score/std": 0.24812039732933044, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0323323011398315, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 27.282554626464844, + "step": 2921 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.34692129492759705, + "epoch": 4.6826923076923075, + "grad_norm": 0.01900269091129303, + "learning_rate": 1e-06, + "loss": 0.0044, + "step": 2922 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3168996274471283, + "epoch": 4.684294871794872, + "grad_norm": 0.011829662136733532, + "learning_rate": 1e-06, + "loss": 0.1236, + "step": 2923 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3172369599342346, + "epoch": 4.685897435897436, + "grad_norm": 0.026483498513698578, + "learning_rate": 1e-06, + "loss": 0.1768, + "step": 2924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3942.0, + "completions/mean_length": 2581.607421875, + "completions/mean_terminated_length": 2049.66943359375, + "completions/min_length": 836.0, + "completions/min_terminated_length": 836.0, + "entropy": 0.3523339778184891, + "epoch": 4.6875, + "frac_reward_zero_std": 0.28125, + "grad_norm": 861.6783447265625, + "learning_rate": 1e-06, + "loss": 0.0552, + "num_tokens": 1755836432.0, + "reward": 0.2985629141330719, + "reward_std": 0.05261341854929924, + "rewards/progression_diversity/mean": -0.0006439232383854687, + "rewards/progression_diversity/std": 0.005445914342999458, + "rewards/symbolic_reward_accuracy/mean": 0.1484375, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.7100748419761658, + "rewards/symbolic_reward_partial_score/std": 0.2193126529455185, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0577545166015625, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 16.59449577331543, + "step": 2925 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.34911976754665375, + "epoch": 4.689102564102564, + "grad_norm": 0.014677703380584717, + "learning_rate": 1e-06, + "loss": 0.0362, + "step": 2926 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.34325067698955536, + "epoch": 4.690705128205128, + "grad_norm": 0.016701413318514824, + "learning_rate": 1e-06, + "loss": 0.0602, + "step": 2927 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.34585435688495636, + "epoch": 4.6923076923076925, + "grad_norm": 0.026525946334004402, + "learning_rate": 1e-06, + "loss": 0.0583, + "step": 2928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3789.0, + "completions/mean_length": 2502.28515625, + "completions/mean_terminated_length": 2054.48779296875, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "entropy": 0.3411019742488861, + "epoch": 4.693910256410256, + "frac_reward_zero_std": 0.3125, + "grad_norm": 701.24560546875, + "learning_rate": 1e-06, + "loss": 0.1057, + "num_tokens": 1757960018.0, + "reward": 0.4438255727291107, + "reward_std": 0.05670292302966118, + "rewards/progression_diversity/mean": -0.0007465816452167928, + "rewards/progression_diversity/std": 0.006228178273886442, + "rewards/symbolic_reward_accuracy/mean": 0.361328125, + "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, + "rewards/symbolic_reward_partial_score/mean": 0.7639485597610474, + "rewards/symbolic_reward_partial_score/std": 0.24817843735218048, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0602136850357056, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 15.609932899475098, + "step": 2929 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3556542992591858, + "epoch": 4.69551282051282, + "grad_norm": 0.02133711613714695, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 2930 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3535788506269455, + "epoch": 4.697115384615385, + "grad_norm": 0.012668832205235958, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 2931 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.34661364555358887, + "epoch": 4.698717948717949, + "grad_norm": 0.013216467574238777, + "learning_rate": 1e-06, + "loss": 0.0117, + "step": 2932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3756.0, + "completions/mean_length": 2681.73046875, + "completions/mean_terminated_length": 2037.2474365234375, + "completions/min_length": 798.0, + "completions/min_terminated_length": 798.0, + "entropy": 0.3384409695863724, + "epoch": 4.700320512820513, + "frac_reward_zero_std": 0.25, + "grad_norm": 556.1314697265625, + "learning_rate": 1e-06, + "loss": 0.0886, + "num_tokens": 1760146200.0, + "reward": 0.3850386440753937, + "reward_std": 0.08345068991184235, + "rewards/progression_diversity/mean": -0.0010191474575549364, + "rewards/progression_diversity/std": 0.0071332212537527084, + "rewards/symbolic_reward_accuracy/mean": 0.279296875, + "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, + "rewards/symbolic_reward_partial_score/mean": 0.738574206829071, + "rewards/symbolic_reward_partial_score/std": 0.2675086259841919, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0498528480529785, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 20.236854553222656, + "step": 2933 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.33859123289585114, + "epoch": 4.701923076923077, + "grad_norm": 0.012142288498580456, + "learning_rate": 1e-06, + "loss": 0.0558, + "step": 2934 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.3401936739683151, + "epoch": 4.703525641025641, + "grad_norm": 0.01944906637072563, + "learning_rate": 1e-06, + "loss": 0.0776, + "step": 2935 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.34829023480415344, + "epoch": 4.705128205128205, + "grad_norm": 0.015569426119327545, + "learning_rate": 1e-06, + "loss": 0.0491, + "step": 2936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3820.0, + "completions/mean_length": 2710.529296875, + "completions/mean_terminated_length": 2038.0633544921875, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.3164035826921463, + "epoch": 4.706730769230769, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1386.5074462890625, + "learning_rate": 1e-06, + "loss": 0.1388, + "num_tokens": 1762370215.0, + "reward": 0.3807540237903595, + "reward_std": 0.08757130056619644, + "rewards/progression_diversity/mean": -0.00028213387122377753, + "rewards/progression_diversity/std": 0.002852478064596653, + "rewards/symbolic_reward_accuracy/mean": 0.26953125, + "rewards/symbolic_reward_accuracy/std": 0.44415023922920227, + "rewards/symbolic_reward_partial_score/mean": 0.7457519769668579, + "rewards/symbolic_reward_partial_score/std": 0.2586316764354706, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.044259786605835, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 21.89950942993164, + "step": 2937 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.35227370262145996, + "epoch": 4.708333333333333, + "grad_norm": 0.01955762878060341, + "learning_rate": 1e-06, + "loss": 0.0149, + "step": 2938 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3245883733034134, + "epoch": 4.709935897435898, + "grad_norm": 0.016230937093496323, + "learning_rate": 1e-06, + "loss": 0.1317, + "step": 2939 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.349644273519516, + "epoch": 4.711538461538462, + "grad_norm": 0.029934851452708244, + "learning_rate": 1e-06, + "loss": 0.0294, + "step": 2940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4126.0, + "completions/mean_length": 2700.505859375, + "completions/mean_terminated_length": 2056.90576171875, + "completions/min_length": 649.0, + "completions/min_terminated_length": 649.0, + "entropy": 0.3398941606283188, + "epoch": 4.7131410256410255, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1812.2177734375, + "learning_rate": 1e-06, + "loss": 0.0752, + "num_tokens": 1764614458.0, + "reward": 0.33322569727897644, + "reward_std": 0.07418441027402878, + "rewards/progression_diversity/mean": -0.00018588484090287238, + "rewards/progression_diversity/std": 0.003068706952035427, + "rewards/symbolic_reward_accuracy/mean": 0.2109375, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.7025552988052368, + "rewards/symbolic_reward_partial_score/std": 0.25797954201698303, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0462868213653564, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 20.06338882446289, + "step": 2941 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.34091413021087646, + "epoch": 4.714743589743589, + "grad_norm": 0.012703510001301765, + "learning_rate": 1e-06, + "loss": 0.0848, + "step": 2942 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3459191471338272, + "epoch": 4.716346153846154, + "grad_norm": 0.11944550275802612, + "learning_rate": 1e-06, + "loss": 0.0311, + "step": 2943 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.31565621495246887, + "epoch": 4.717948717948718, + "grad_norm": 0.027011489495635033, + "learning_rate": 1e-06, + "loss": 0.1477, + "step": 2944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5223.0, + "completions/mean_length": 2783.65234375, + "completions/mean_terminated_length": 2026.5196533203125, + "completions/min_length": 704.0, + "completions/min_terminated_length": 704.0, + "entropy": 0.3283272385597229, + "epoch": 4.719551282051282, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1712.640625, + "learning_rate": 1e-06, + "loss": 0.1465, + "num_tokens": 1766886824.0, + "reward": 0.4009649455547333, + "reward_std": 0.07681870460510254, + "rewards/progression_diversity/mean": -0.0001883438671939075, + "rewards/progression_diversity/std": 0.0030330359004437923, + "rewards/symbolic_reward_accuracy/mean": 0.30078125, + "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, + "rewards/symbolic_reward_partial_score/mean": 0.751269519329071, + "rewards/symbolic_reward_partial_score/std": 0.2660304307937622, + "rewards/tag_count_reward/mean": -0.048828125, + "rewards/tag_count_reward/std": 0.2157193273305893, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0471668243408203, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 20.307308197021484, + "step": 2945 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.34363169968128204, + "epoch": 4.721153846153846, + "grad_norm": 0.8427232503890991, + "learning_rate": 1e-06, + "loss": 0.0328, + "step": 2946 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3473658561706543, + "epoch": 4.722756410256411, + "grad_norm": 0.014580237679183483, + "learning_rate": 1e-06, + "loss": 0.0303, + "step": 2947 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.32401567697525024, + "epoch": 4.7243589743589745, + "grad_norm": 0.0191756933927536, + "learning_rate": 1e-06, + "loss": 0.1417, + "step": 2948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3855.0, + "completions/mean_length": 3248.55078125, + "completions/mean_terminated_length": 2044.234619140625, + "completions/min_length": 663.0, + "completions/min_terminated_length": 663.0, + "entropy": 0.31611892580986023, + "epoch": 4.725961538461538, + "frac_reward_zero_std": 0.125, + "grad_norm": 2376.0654296875, + "learning_rate": 1e-06, + "loss": 0.1231, + "num_tokens": 1769473858.0, + "reward": 0.37166327238082886, + "reward_std": 0.13059641420841217, + "rewards/progression_diversity/mean": -0.00017961920821107924, + "rewards/progression_diversity/std": 0.00354160089045763, + "rewards/symbolic_reward_accuracy/mean": 0.26953125, + "rewards/symbolic_reward_accuracy/std": 0.44415023922920227, + "rewards/symbolic_reward_partial_score/mean": 0.724560558795929, + "rewards/symbolic_reward_partial_score/std": 0.29390403628349304, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.021704912185669, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 30.869539260864258, + "step": 2949 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3012668788433075, + "epoch": 4.727564102564102, + "grad_norm": 0.017393101006746292, + "learning_rate": 1e-06, + "loss": 0.1819, + "step": 2950 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3096124082803726, + "epoch": 4.729166666666667, + "grad_norm": 0.22197610139846802, + "learning_rate": 1e-06, + "loss": 0.1091, + "step": 2951 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3276834934949875, + "epoch": 4.730769230769231, + "grad_norm": 0.0192260779440403, + "learning_rate": 1e-06, + "loss": 0.0842, + "step": 2952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4381.0, + "completions/mean_length": 2684.7734375, + "completions/mean_terminated_length": 2040.4334716796875, + "completions/min_length": 447.0, + "completions/min_terminated_length": 447.0, + "entropy": 0.3336394876241684, + "epoch": 4.732371794871795, + "frac_reward_zero_std": 0.3125, + "grad_norm": 2094.456298828125, + "learning_rate": 1e-06, + "loss": 0.0981, + "num_tokens": 1771661390.0, + "reward": 0.2928215265274048, + "reward_std": 0.06420518457889557, + "rewards/progression_diversity/mean": -7.55006549297832e-05, + "rewards/progression_diversity/std": 0.001374648418277502, + "rewards/symbolic_reward_accuracy/mean": 0.1484375, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.6941732168197632, + "rewards/symbolic_reward_partial_score/std": 0.24089013040065765, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0465376377105713, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 20.815563201904297, + "step": 2953 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3418269008398056, + "epoch": 4.733974358974359, + "grad_norm": 0.02304861880838871, + "learning_rate": 1e-06, + "loss": 0.0803, + "step": 2954 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3348529040813446, + "epoch": 4.735576923076923, + "grad_norm": 0.02030983939766884, + "learning_rate": 1e-06, + "loss": 0.0815, + "step": 2955 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3480427414178848, + "epoch": 4.737179487179487, + "grad_norm": 0.012364787049591541, + "learning_rate": 1e-06, + "loss": 0.0586, + "step": 2956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3864.0, + "completions/mean_length": 3018.962890625, + "completions/mean_terminated_length": 2068.311767578125, + "completions/min_length": 755.0, + "completions/min_terminated_length": 755.0, + "entropy": 0.30448389053344727, + "epoch": 4.738782051282051, + "frac_reward_zero_std": 0.0625, + "grad_norm": 8229.33984375, + "learning_rate": 1e-06, + "loss": 0.1366, + "num_tokens": 1774171099.0, + "reward": 0.29637160897254944, + "reward_std": 0.10101476311683655, + "rewards/progression_diversity/mean": -4.6070697862887755e-05, + "rewards/progression_diversity/std": 0.0010424609063193202, + "rewards/symbolic_reward_accuracy/mean": 0.154296875, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.6994954347610474, + "rewards/symbolic_reward_partial_score/std": 0.25719350576400757, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0173419713974, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 31.943614959716797, + "step": 2957 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3160858005285263, + "epoch": 4.740384615384615, + "grad_norm": 0.03362959623336792, + "learning_rate": 1e-06, + "loss": 0.1305, + "step": 2958 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3264904320240021, + "epoch": 4.74198717948718, + "grad_norm": 0.018001394346356392, + "learning_rate": 1e-06, + "loss": 0.06, + "step": 2959 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.30660150945186615, + "epoch": 4.743589743589744, + "grad_norm": 0.01249605417251587, + "learning_rate": 1e-06, + "loss": 0.144, + "step": 2960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3706.0, + "completions/mean_length": 2441.875, + "completions/mean_terminated_length": 1992.1290283203125, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.35260292887687683, + "epoch": 4.7451923076923075, + "frac_reward_zero_std": 0.375, + "grad_norm": 488.8838806152344, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 1776175595.0, + "reward": 0.36955156922340393, + "reward_std": 0.05210120975971222, + "rewards/progression_diversity/mean": -0.0004122033715248108, + "rewards/progression_diversity/std": 0.007343790493905544, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.7539876103401184, + "rewards/symbolic_reward_partial_score/std": 0.2148759663105011, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0614118576049805, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 13.594598770141602, + "step": 2961 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.347807839512825, + "epoch": 4.746794871794872, + "grad_norm": 0.016412392258644104, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 2962 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.33857110142707825, + "epoch": 4.748397435897436, + "grad_norm": 0.016963588073849678, + "learning_rate": 1e-06, + "loss": 0.0678, + "step": 2963 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.34765180945396423, + "epoch": 4.75, + "grad_norm": 0.03191974386572838, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 2964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4131.0, + "completions/mean_length": 3280.611328125, + "completions/mean_terminated_length": 2079.234619140625, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.29712438583374023, + "epoch": 4.751602564102564, + "frac_reward_zero_std": 0.1875, + "grad_norm": 3667.5302734375, + "learning_rate": 1e-06, + "loss": 0.1519, + "num_tokens": 1778749860.0, + "reward": 0.3474825918674469, + "reward_std": 0.08697932213544846, + "rewards/progression_diversity/mean": -0.00027683598455041647, + "rewards/progression_diversity/std": 0.002760024508461356, + "rewards/symbolic_reward_accuracy/mean": 0.244140625, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.6973470449447632, + "rewards/symbolic_reward_partial_score/std": 0.2716548442840576, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0109891891479492, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 36.200584411621094, + "step": 2965 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3089023381471634, + "epoch": 4.753205128205128, + "grad_norm": 0.01566610299050808, + "learning_rate": 1e-06, + "loss": 0.2778, + "step": 2966 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2989957928657532, + "epoch": 4.7548076923076925, + "grad_norm": 0.03153858333826065, + "learning_rate": 1e-06, + "loss": 0.1536, + "step": 2967 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3228546977043152, + "epoch": 4.756410256410256, + "grad_norm": 0.017119932919740677, + "learning_rate": 1e-06, + "loss": 0.0581, + "step": 2968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3993.0, + "completions/mean_length": 3280.58203125, + "completions/mean_terminated_length": 2048.636962890625, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.303473562002182, + "epoch": 4.75801282051282, + "frac_reward_zero_std": 0.125, + "grad_norm": 2500.171142578125, + "learning_rate": 1e-06, + "loss": 0.1769, + "num_tokens": 1781321806.0, + "reward": 0.26970726251602173, + "reward_std": 0.08941252529621124, + "rewards/progression_diversity/mean": -0.0009572784183546901, + "rewards/progression_diversity/std": 0.009158154018223286, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.661425769329071, + "rewards/symbolic_reward_partial_score/std": 0.2673282027244568, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0137133598327637, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 36.107513427734375, + "step": 2969 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3026905506849289, + "epoch": 4.759615384615385, + "grad_norm": 0.021056868135929108, + "learning_rate": 1e-06, + "loss": 0.14, + "step": 2970 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.3188410997390747, + "epoch": 4.761217948717949, + "grad_norm": 0.01921817846596241, + "learning_rate": 1e-06, + "loss": 0.095, + "step": 2971 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.32021498680114746, + "epoch": 4.762820512820513, + "grad_norm": 0.04374043643474579, + "learning_rate": 1e-06, + "loss": 0.0584, + "step": 2972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3804.0, + "completions/mean_length": 3198.404296875, + "completions/mean_terminated_length": 2050.61376953125, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "entropy": 0.3287845551967621, + "epoch": 4.764423076923077, + "frac_reward_zero_std": 0.28125, + "grad_norm": 576.1661987304688, + "learning_rate": 1e-06, + "loss": 0.0627, + "num_tokens": 1783830109.0, + "reward": 0.3199334740638733, + "reward_std": 0.08054739236831665, + "rewards/progression_diversity/mean": -0.0003103530907537788, + "rewards/progression_diversity/std": 0.003829639870673418, + "rewards/symbolic_reward_accuracy/mean": 0.193359375, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.7044758796691895, + "rewards/symbolic_reward_partial_score/std": 0.27706295251846313, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.010635495185852, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 784.0, + "sampling/sampling_logp_difference/mean": 36.60023498535156, + "step": 2973 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3081291615962982, + "epoch": 4.766025641025641, + "grad_norm": 0.018835294991731644, + "learning_rate": 1e-06, + "loss": 0.1503, + "step": 2974 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.29927410185337067, + "epoch": 4.767628205128205, + "grad_norm": 0.013380290009081364, + "learning_rate": 1e-06, + "loss": 0.1841, + "step": 2975 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.322013258934021, + "epoch": 4.769230769230769, + "grad_norm": 0.016294099390506744, + "learning_rate": 1e-06, + "loss": 0.09, + "step": 2976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4037.0, + "completions/mean_length": 3144.052734375, + "completions/mean_terminated_length": 2142.71240234375, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.3358401507139206, + "epoch": 4.770833333333333, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1361.25830078125, + "learning_rate": 1e-06, + "loss": 0.0311, + "num_tokens": 1786338952.0, + "reward": 0.34467262029647827, + "reward_std": 0.06627324223518372, + "rewards/progression_diversity/mean": -2.456928814353887e-05, + "rewards/progression_diversity/std": 0.000555939506739378, + "rewards/symbolic_reward_accuracy/mean": 0.24609375, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.6788574457168579, + "rewards/symbolic_reward_partial_score/std": 0.2802160680294037, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0176117420196533, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 780.0, + "sampling/sampling_logp_difference/mean": 32.03617858886719, + "step": 2977 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.31088291108608246, + "epoch": 4.772435897435898, + "grad_norm": 0.012580040842294693, + "learning_rate": 1e-06, + "loss": 0.1263, + "step": 2978 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.31900927424430847, + "epoch": 4.774038461538462, + "grad_norm": 0.012707662768661976, + "learning_rate": 1e-06, + "loss": 0.0779, + "step": 2979 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.29433473944664, + "epoch": 4.7756410256410255, + "grad_norm": 0.01724996045231819, + "learning_rate": 1e-06, + "loss": 0.2277, + "step": 2980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.115234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3774.0, + "completions/mean_length": 3808.806640625, + "completions/mean_terminated_length": 2170.97802734375, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.28534041345119476, + "epoch": 4.777243589743589, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1845.77587890625, + "learning_rate": 1e-06, + "loss": 0.1926, + "num_tokens": 1789274645.0, + "reward": 0.25106510519981384, + "reward_std": 0.11057647317647934, + "rewards/progression_diversity/mean": -0.0009115600259974599, + "rewards/progression_diversity/std": 0.0063006458804011345, + "rewards/symbolic_reward_accuracy/mean": 0.130859375, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.6110025644302368, + "rewards/symbolic_reward_partial_score/std": 0.29573264718055725, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9930451512336731, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 40.782203674316406, + "step": 2981 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.303585946559906, + "epoch": 4.778846153846154, + "grad_norm": 0.012852794490754604, + "learning_rate": 1e-06, + "loss": 0.1133, + "step": 2982 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2768773287534714, + "epoch": 4.780448717948718, + "grad_norm": 0.10220827162265778, + "learning_rate": 1e-06, + "loss": 0.2116, + "step": 2983 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30345627665519714, + "epoch": 4.782051282051282, + "grad_norm": 0.06796098500490189, + "learning_rate": 1e-06, + "loss": 0.104, + "step": 2984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3753.0, + "completions/mean_length": 3173.6015625, + "completions/mean_terminated_length": 2174.495849609375, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.30603721737861633, + "epoch": 4.783653846153846, + "frac_reward_zero_std": 0.1875, + "grad_norm": 5978.078125, + "learning_rate": 1e-06, + "loss": 0.1597, + "num_tokens": 1791752489.0, + "reward": 0.3074996769428253, + "reward_std": 0.0968872681260109, + "rewards/progression_diversity/mean": -0.0014983013970777392, + "rewards/progression_diversity/std": 0.009809834882616997, + "rewards/symbolic_reward_accuracy/mean": 0.19140625, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.6656738519668579, + "rewards/symbolic_reward_partial_score/std": 0.2579022943973541, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0267293453216553, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 26.516414642333984, + "step": 2985 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.32077351212501526, + "epoch": 4.785256410256411, + "grad_norm": 0.03152855858206749, + "learning_rate": 1e-06, + "loss": 0.1114, + "step": 2986 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3265487253665924, + "epoch": 4.7868589743589745, + "grad_norm": 5331029908783104.0, + "learning_rate": 1e-06, + "loss": 734216716288.0, + "step": 2987 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.31940317153930664, + "epoch": 4.788461538461538, + "grad_norm": 18845.392578125, + "learning_rate": 1e-06, + "loss": 5.7057, + "step": 2988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4206.0, + "completions/mean_length": 3355.01953125, + "completions/mean_terminated_length": 2160.46484375, + "completions/min_length": 952.0, + "completions/min_terminated_length": 952.0, + "entropy": 0.3106093406677246, + "epoch": 4.790064102564102, + "frac_reward_zero_std": 0.25, + "grad_norm": 1507.441650390625, + "learning_rate": 1e-06, + "loss": 0.097, + "num_tokens": 1794362163.0, + "reward": 0.36152663826942444, + "reward_std": 0.09895937889814377, + "rewards/progression_diversity/mean": -0.0021223740186542273, + "rewards/progression_diversity/std": 0.013677747920155525, + "rewards/symbolic_reward_accuracy/mean": 0.2578125, + "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, + "rewards/symbolic_reward_partial_score/mean": 0.7129719853401184, + "rewards/symbolic_reward_partial_score/std": 0.2863211929798126, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0154645442962646, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 28.80075454711914, + "step": 2989 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.28619883954524994, + "epoch": 4.791666666666667, + "grad_norm": 0.02123548462986946, + "learning_rate": 1e-06, + "loss": 0.1906, + "step": 2990 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.29719385504722595, + "epoch": 4.793269230769231, + "grad_norm": 0.01841905526816845, + "learning_rate": 1e-06, + "loss": 0.1245, + "step": 2991 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3073057234287262, + "epoch": 4.794871794871795, + "grad_norm": 0.01792910508811474, + "learning_rate": 1e-06, + "loss": 0.0712, + "step": 2992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.13671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4189.0, + "completions/mean_length": 3995.111328125, + "completions/mean_terminated_length": 2033.0701904296875, + "completions/min_length": 405.0, + "completions/min_terminated_length": 405.0, + "entropy": 0.2574385032057762, + "epoch": 4.796474358974359, + "frac_reward_zero_std": 0.1875, + "grad_norm": 2505.857666015625, + "learning_rate": 1e-06, + "loss": 0.197, + "num_tokens": 1797306380.0, + "reward": 0.26426079869270325, + "reward_std": 0.08823113143444061, + "rewards/progression_diversity/mean": -0.000679816585034132, + "rewards/progression_diversity/std": 0.004317981190979481, + "rewards/symbolic_reward_accuracy/mean": 0.15234375, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.6165690422058105, + "rewards/symbolic_reward_partial_score/std": 0.3016470968723297, + "rewards/tag_count_reward/mean": -0.12109375, + "rewards/tag_count_reward/std": 0.3265552520751953, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852645397186279, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 33.74715042114258, + "step": 2993 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2707083225250244, + "epoch": 4.798076923076923, + "grad_norm": 0.09528608620166779, + "learning_rate": 1e-06, + "loss": 0.13, + "step": 2994 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2617756277322769, + "epoch": 4.799679487179487, + "grad_norm": 0.01799207553267479, + "learning_rate": 1e-06, + "loss": 0.1663, + "step": 2995 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.28101493418216705, + "epoch": 4.801282051282051, + "grad_norm": 0.01640203595161438, + "learning_rate": 1e-06, + "loss": 0.153, + "step": 2996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3428.0, + "completions/mean_length": 3477.884765625, + "completions/mean_terminated_length": 2018.9324951171875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.3022962361574173, + "epoch": 4.802884615384615, + "frac_reward_zero_std": 0.25, + "grad_norm": 2163.138427734375, + "learning_rate": 1e-06, + "loss": 0.096, + "num_tokens": 1799923057.0, + "reward": 0.2933644652366638, + "reward_std": 0.09190071374177933, + "rewards/progression_diversity/mean": -0.00046986271627247334, + "rewards/progression_diversity/std": 0.004312288947403431, + "rewards/symbolic_reward_accuracy/mean": 0.162109375, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.6836262941360474, + "rewards/symbolic_reward_partial_score/std": 0.27113112807273865, + "rewards/tag_count_reward/mean": -0.08984375, + "rewards/tag_count_reward/std": 0.2862374484539032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0017528533935547, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 760.0, + "sampling/sampling_logp_difference/mean": 26.413158416748047, + "step": 2997 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2857065796852112, + "epoch": 4.80448717948718, + "grad_norm": 0.01477570179849863, + "learning_rate": 1e-06, + "loss": 0.2033, + "step": 2998 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2832331210374832, + "epoch": 4.806089743589744, + "grad_norm": 0.016135964542627335, + "learning_rate": 1e-06, + "loss": 0.9264, + "step": 2999 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30287647247314453, + "epoch": 4.8076923076923075, + "grad_norm": 3408249.25, + "learning_rate": 1e-06, + "loss": 1910.0106, + "step": 3000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4039.0, + "completions/mean_length": 3437.560546875, + "completions/mean_terminated_length": 2005.30810546875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.31259530782699585, + "epoch": 4.809294871794872, + "frac_reward_zero_std": 0.21875, + "grad_norm": 722.9392700195312, + "learning_rate": 1e-06, + "loss": 0.0731, + "num_tokens": 1802489712.0, + "reward": 0.3412483334541321, + "reward_std": 0.09203283488750458, + "rewards/progression_diversity/mean": -0.00016795920964796096, + "rewards/progression_diversity/std": 0.0027168295346200466, + "rewards/symbolic_reward_accuracy/mean": 0.234375, + "rewards/symbolic_reward_accuracy/std": 0.42402184009552, + "rewards/symbolic_reward_partial_score/mean": 0.7013020515441895, + "rewards/symbolic_reward_partial_score/std": 0.2958465814590454, + "rewards/tag_count_reward/mean": -0.09765625, + "rewards/tag_count_reward/std": 0.29713961482048035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0116255283355713, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 22.6229248046875, + "step": 3001 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3023006319999695, + "epoch": 4.810897435897436, + "grad_norm": 0.01664135977625847, + "learning_rate": 1e-06, + "loss": 0.1274, + "step": 3002 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.30224159359931946, + "epoch": 4.8125, + "grad_norm": 0.016737397760152817, + "learning_rate": 1e-06, + "loss": 0.0899, + "step": 3003 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2703643888235092, + "epoch": 4.814102564102564, + "grad_norm": 0.019325420260429382, + "learning_rate": 1e-06, + "loss": 0.2522, + "step": 3004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3899.0, + "completions/mean_length": 3158.623046875, + "completions/mean_terminated_length": 2007.3695068359375, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "entropy": 0.29125888645648956, + "epoch": 4.815705128205128, + "frac_reward_zero_std": 0.25, + "grad_norm": 1334.0791015625, + "learning_rate": 1e-06, + "loss": 0.1156, + "num_tokens": 1804920927.0, + "reward": 0.4101855754852295, + "reward_std": 0.08342595398426056, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.328125, + "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, + "rewards/symbolic_reward_partial_score/mean": 0.7468424439430237, + "rewards/symbolic_reward_partial_score/std": 0.2777378261089325, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0229686498641968, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 15.124759674072266, + "step": 3005 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2940514087677002, + "epoch": 4.8173076923076925, + "grad_norm": 0.030098246410489082, + "learning_rate": 1e-06, + "loss": 0.1046, + "step": 3006 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.29679183661937714, + "epoch": 4.818910256410256, + "grad_norm": 0.13558229804039001, + "learning_rate": 1e-06, + "loss": 0.0689, + "step": 3007 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.28803931176662445, + "epoch": 4.82051282051282, + "grad_norm": 0.028529301285743713, + "learning_rate": 1e-06, + "loss": 0.0629, + "step": 3008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3583.0, + "completions/mean_length": 3604.537109375, + "completions/mean_terminated_length": 2066.52734375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.2692013829946518, + "epoch": 4.822115384615385, + "frac_reward_zero_std": 0.125, + "grad_norm": 2515.8955078125, + "learning_rate": 1e-06, + "loss": 0.1519, + "num_tokens": 1807674082.0, + "reward": 0.2645507752895355, + "reward_std": 0.10678978264331818, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.138671875, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.6370442509651184, + "rewards/symbolic_reward_partial_score/std": 0.28384286165237427, + "rewards/tag_count_reward/mean": -0.09765625, + "rewards/tag_count_reward/std": 0.29713961482048035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983115553855896, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 29.70915985107422, + "step": 3009 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.26541532576084137, + "epoch": 4.823717948717949, + "grad_norm": 0.19816499948501587, + "learning_rate": 1e-06, + "loss": 0.1234, + "step": 3010 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.2563439905643463, + "epoch": 4.825320512820513, + "grad_norm": 0.014676299877464771, + "learning_rate": 1e-06, + "loss": 0.152, + "step": 3011 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.27064676582813263, + "epoch": 4.826923076923077, + "grad_norm": 0.022342152893543243, + "learning_rate": 1e-06, + "loss": 0.1299, + "step": 3012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4520.0, + "completions/mean_length": 3052.28125, + "completions/mean_terminated_length": 2074.062744140625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.2788449823856354, + "epoch": 4.828525641025641, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1405.5775146484375, + "learning_rate": 1e-06, + "loss": 0.07, + "num_tokens": 1810095938.0, + "reward": 0.3012896776199341, + "reward_std": 0.07198358327150345, + "rewards/progression_diversity/mean": -0.0009162867208942771, + "rewards/progression_diversity/std": 0.006650157272815704, + "rewards/symbolic_reward_accuracy/mean": 0.177734375, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.6703450679779053, + "rewards/symbolic_reward_partial_score/std": 0.2553132474422455, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0105712413787842, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 764.0, + "sampling/sampling_logp_difference/mean": 21.2810001373291, + "step": 3013 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.258184090256691, + "epoch": 4.830128205128205, + "grad_norm": 0.03618193790316582, + "learning_rate": 1e-06, + "loss": 0.184, + "step": 3014 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2832891196012497, + "epoch": 4.831730769230769, + "grad_norm": 0.017958376556634903, + "learning_rate": 1e-06, + "loss": 0.0317, + "step": 3015 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2775871604681015, + "epoch": 4.833333333333333, + "grad_norm": 0.02259541116654873, + "learning_rate": 1e-06, + "loss": 0.0932, + "step": 3016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3844.0, + "completions/mean_length": 3652.623046875, + "completions/mean_terminated_length": 2120.400390625, + "completions/min_length": 610.0, + "completions/min_terminated_length": 610.0, + "entropy": 0.25543099641799927, + "epoch": 4.834935897435898, + "frac_reward_zero_std": 0.25, + "grad_norm": 3206.30419921875, + "learning_rate": 1e-06, + "loss": 0.1315, + "num_tokens": 1812886417.0, + "reward": 0.26881104707717896, + "reward_std": 0.09635767340660095, + "rewards/progression_diversity/mean": -0.0021946923807263374, + "rewards/progression_diversity/std": 0.014525186270475388, + "rewards/symbolic_reward_accuracy/mean": 0.1328125, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.6617349982261658, + "rewards/symbolic_reward_partial_score/std": 0.27741891145706177, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9844847321510315, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 35.239051818847656, + "step": 3017 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2537519484758377, + "epoch": 4.836538461538462, + "grad_norm": 0.014091641642153263, + "learning_rate": 1e-06, + "loss": 0.1088, + "step": 3018 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2543463036417961, + "epoch": 4.8381410256410255, + "grad_norm": 0.02137858420610428, + "learning_rate": 1e-06, + "loss": 0.148, + "step": 3019 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.25083983689546585, + "epoch": 4.839743589743589, + "grad_norm": 0.021724727004766464, + "learning_rate": 1e-06, + "loss": 0.1358, + "step": 3020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3564.0, + "completions/mean_length": 3640.005859375, + "completions/mean_terminated_length": 2043.505615234375, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "entropy": 0.25516054779291153, + "epoch": 4.841346153846154, + "frac_reward_zero_std": 0.28125, + "grad_norm": 753.2295532226562, + "learning_rate": 1e-06, + "loss": 0.0884, + "num_tokens": 1815684788.0, + "reward": 0.29512912034988403, + "reward_std": 0.08512111753225327, + "rewards/progression_diversity/mean": -0.0027145203202962875, + "rewards/progression_diversity/std": 0.0170114878565073, + "rewards/symbolic_reward_accuracy/mean": 0.171875, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.6752604246139526, + "rewards/symbolic_reward_partial_score/std": 0.29889386892318726, + "rewards/tag_count_reward/mean": -0.10546875, + "rewards/tag_count_reward/std": 0.3074568510055542, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.986849308013916, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 32.57376480102539, + "step": 3021 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.23778339475393295, + "epoch": 4.842948717948718, + "grad_norm": 0.759289562702179, + "learning_rate": 1e-06, + "loss": 0.2053, + "step": 3022 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.25857195258140564, + "epoch": 4.844551282051282, + "grad_norm": 0.02268078364431858, + "learning_rate": 1e-06, + "loss": 0.0626, + "step": 3023 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.24314356595277786, + "epoch": 4.846153846153846, + "grad_norm": 0.016103658825159073, + "learning_rate": 1e-06, + "loss": 0.1707, + "step": 3024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4237.0, + "completions/mean_length": 3289.69921875, + "completions/mean_terminated_length": 2027.935791015625, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "entropy": 0.27912090718746185, + "epoch": 4.847756410256411, + "frac_reward_zero_std": 0.40625, + "grad_norm": 1176.0621337890625, + "learning_rate": 1e-06, + "loss": 0.0734, + "num_tokens": 1818186474.0, + "reward": 0.2887709140777588, + "reward_std": 0.047475919127464294, + "rewards/progression_diversity/mean": -0.0008389006252400577, + "rewards/progression_diversity/std": 0.008314850740134716, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.6767903566360474, + "rewards/symbolic_reward_partial_score/std": 0.2791590392589569, + "rewards/tag_count_reward/mean": -0.080078125, + "rewards/tag_count_reward/std": 0.271679550409317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0138845443725586, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 768.0, + "sampling/sampling_logp_difference/mean": 23.615236282348633, + "step": 3025 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.28166961669921875, + "epoch": 4.8493589743589745, + "grad_norm": 0.01939082331955433, + "learning_rate": 1e-06, + "loss": 0.0351, + "step": 3026 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.25974585115909576, + "epoch": 4.850961538461538, + "grad_norm": 0.029129792004823685, + "learning_rate": 1e-06, + "loss": 0.1673, + "step": 3027 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.27130499482154846, + "epoch": 4.852564102564102, + "grad_norm": 0.09306855499744415, + "learning_rate": 1e-06, + "loss": 0.0714, + "step": 3028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3573.0, + "completions/mean_length": 3610.380859375, + "completions/mean_terminated_length": 2041.6907958984375, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.2426222562789917, + "epoch": 4.854166666666667, + "frac_reward_zero_std": 0.25, + "grad_norm": 1595.66259765625, + "learning_rate": 1e-06, + "loss": 0.126, + "num_tokens": 1820930301.0, + "reward": 0.30804911255836487, + "reward_std": 0.07695259153842926, + "rewards/progression_diversity/mean": -0.0027075535617768764, + "rewards/progression_diversity/std": 0.02146134339272976, + "rewards/symbolic_reward_accuracy/mean": 0.20703125, + "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, + "rewards/symbolic_reward_partial_score/mean": 0.6467121839523315, + "rewards/symbolic_reward_partial_score/std": 0.3115384578704834, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9908959269523621, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 32.86491012573242, + "step": 3029 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24747144430875778, + "epoch": 4.855769230769231, + "grad_norm": 0.017448777332901955, + "learning_rate": 1e-06, + "loss": 0.1274, + "step": 3030 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.26713868975639343, + "epoch": 4.857371794871795, + "grad_norm": 0.018266018480062485, + "learning_rate": 1e-06, + "loss": 0.0513, + "step": 3031 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23238349705934525, + "epoch": 4.858974358974359, + "grad_norm": 0.013434065505862236, + "learning_rate": 1e-06, + "loss": 0.2029, + "step": 3032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3829.0, + "completions/mean_length": 3513.24609375, + "completions/mean_terminated_length": 1995.733642578125, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "entropy": 0.2464209794998169, + "epoch": 4.860576923076923, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1448.568115234375, + "learning_rate": 1e-06, + "loss": 0.1083, + "num_tokens": 1823646795.0, + "reward": 0.3104338049888611, + "reward_std": 0.06594209372997284, + "rewards/progression_diversity/mean": -0.003005102276802063, + "rewards/progression_diversity/std": 0.022076178342103958, + "rewards/symbolic_reward_accuracy/mean": 0.18359375, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.698291003704071, + "rewards/symbolic_reward_partial_score/std": 0.2843267321586609, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879831075668335, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 34.731117248535156, + "step": 3033 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.23648971319198608, + "epoch": 4.862179487179487, + "grad_norm": 0.027491990476846695, + "learning_rate": 1e-06, + "loss": 0.1345, + "step": 3034 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23159796744585037, + "epoch": 4.863782051282051, + "grad_norm": 0.27813470363616943, + "learning_rate": 1e-06, + "loss": 0.1389, + "step": 3035 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.23856277763843536, + "epoch": 4.865384615384615, + "grad_norm": 0.04474368691444397, + "learning_rate": 1e-06, + "loss": 0.0929, + "step": 3036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3863.0, + "completions/mean_length": 3395.388671875, + "completions/mean_terminated_length": 1927.11083984375, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "entropy": 0.24479106068611145, + "epoch": 4.86698717948718, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1553.0164794921875, + "learning_rate": 1e-06, + "loss": 0.1017, + "num_tokens": 1826229362.0, + "reward": 0.31477978825569153, + "reward_std": 0.07595334947109222, + "rewards/progression_diversity/mean": -0.0034673186019062996, + "rewards/progression_diversity/std": 0.023447884246706963, + "rewards/symbolic_reward_accuracy/mean": 0.208984375, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.6639648079872131, + "rewards/symbolic_reward_partial_score/std": 0.2900123596191406, + "rewards/tag_count_reward/mean": -0.09765625, + "rewards/tag_count_reward/std": 0.29713961482048035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9932220578193665, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 33.091861724853516, + "step": 3037 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.24616575241088867, + "epoch": 4.868589743589744, + "grad_norm": 0.10199563205242157, + "learning_rate": 1e-06, + "loss": 0.0912, + "step": 3038 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.23579637706279755, + "epoch": 4.8701923076923075, + "grad_norm": 0.0868377834558487, + "learning_rate": 1e-06, + "loss": 0.1658, + "step": 3039 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.23798397183418274, + "epoch": 4.871794871794872, + "grad_norm": 0.019305624067783356, + "learning_rate": 1e-06, + "loss": 0.1112, + "step": 3040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3809.0, + "completions/mean_length": 3376.041015625, + "completions/mean_terminated_length": 1968.2532958984375, + "completions/min_length": 368.0, + "completions/min_terminated_length": 368.0, + "entropy": 0.2433418408036232, + "epoch": 4.873397435897436, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1277.455810546875, + "learning_rate": 1e-06, + "loss": 0.0845, + "num_tokens": 1828798391.0, + "reward": 0.32777589559555054, + "reward_std": 0.09310010075569153, + "rewards/progression_diversity/mean": -0.002196947578340769, + "rewards/progression_diversity/std": 0.019314348697662354, + "rewards/symbolic_reward_accuracy/mean": 0.224609375, + "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, + "rewards/symbolic_reward_partial_score/mean": 0.675341784954071, + "rewards/symbolic_reward_partial_score/std": 0.3014916479587555, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9986719489097595, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 29.44231605529785, + "step": 3041 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.25057369470596313, + "epoch": 4.875, + "grad_norm": 0.013811684213578701, + "learning_rate": 1e-06, + "loss": 0.0995, + "step": 3042 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.24305298924446106, + "epoch": 4.876602564102564, + "grad_norm": 0.014420563355088234, + "learning_rate": 1e-06, + "loss": 0.1266, + "step": 3043 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.24530383944511414, + "epoch": 4.878205128205128, + "grad_norm": 0.03273453935980797, + "learning_rate": 1e-06, + "loss": 0.0984, + "step": 3044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3740.0, + "completions/mean_length": 2730.166015625, + "completions/mean_terminated_length": 1940.274658203125, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.24772102385759354, + "epoch": 4.8798076923076925, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1146.00244140625, + "learning_rate": 1e-06, + "loss": 0.1173, + "num_tokens": 1830977612.0, + "reward": 0.41831594705581665, + "reward_std": 0.06758980453014374, + "rewards/progression_diversity/mean": -0.0014180454891175032, + "rewards/progression_diversity/std": 0.010988151654601097, + "rewards/symbolic_reward_accuracy/mean": 0.330078125, + "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, + "rewards/symbolic_reward_partial_score/mean": 0.7512043714523315, + "rewards/symbolic_reward_partial_score/std": 0.2662599980831146, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0178977251052856, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 20.070125579833984, + "step": 3045 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2448933646082878, + "epoch": 4.881410256410256, + "grad_norm": 0.04779614508152008, + "learning_rate": 1e-06, + "loss": 0.1154, + "step": 3046 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.26386718451976776, + "epoch": 4.88301282051282, + "grad_norm": 0.013240370899438858, + "learning_rate": 1e-06, + "loss": 0.0541, + "step": 3047 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25863589346408844, + "epoch": 4.884615384615385, + "grad_norm": 0.013798616826534271, + "learning_rate": 1e-06, + "loss": 0.0337, + "step": 3048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3673.0, + "completions/mean_length": 3313.994140625, + "completions/mean_terminated_length": 1992.9398193359375, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.23720625787973404, + "epoch": 4.886217948717949, + "frac_reward_zero_std": 0.34375, + "grad_norm": 2632.1220703125, + "learning_rate": 1e-06, + "loss": 0.1328, + "num_tokens": 1833494697.0, + "reward": 0.3145332932472229, + "reward_std": 0.07812836021184921, + "rewards/progression_diversity/mean": -0.0022365176118910313, + "rewards/progression_diversity/std": 0.015494197607040405, + "rewards/symbolic_reward_accuracy/mean": 0.205078125, + "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, + "rewards/symbolic_reward_partial_score/mean": 0.6689616441726685, + "rewards/symbolic_reward_partial_score/std": 0.2868025302886963, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.003543734550476, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 26.09931182861328, + "step": 3049 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2454667165875435, + "epoch": 4.887820512820513, + "grad_norm": 0.01591862179338932, + "learning_rate": 1e-06, + "loss": 0.0864, + "step": 3050 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.24274562299251556, + "epoch": 4.889423076923077, + "grad_norm": 0.19580645859241486, + "learning_rate": 1e-06, + "loss": 0.0845, + "step": 3051 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23570669442415237, + "epoch": 4.891025641025641, + "grad_norm": 0.03217773512005806, + "learning_rate": 1e-06, + "loss": 0.1089, + "step": 3052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3641.0, + "completions/mean_length": 2654.248046875, + "completions/mean_terminated_length": 1979.0142822265625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.24544284492731094, + "epoch": 4.892628205128205, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1223.9429931640625, + "learning_rate": 1e-06, + "loss": 0.1096, + "num_tokens": 1835634792.0, + "reward": 0.47855299711227417, + "reward_std": 0.09513897448778152, + "rewards/progression_diversity/mean": -0.0026136531960219145, + "rewards/progression_diversity/std": 0.020054563879966736, + "rewards/symbolic_reward_accuracy/mean": 0.40625, + "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, + "rewards/symbolic_reward_partial_score/mean": 0.7983887195587158, + "rewards/symbolic_reward_partial_score/std": 0.2503470480442047, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.015138864517212, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 20.58639144897461, + "step": 3053 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.25392650067806244, + "epoch": 4.894230769230769, + "grad_norm": 0.018817421048879623, + "learning_rate": 1e-06, + "loss": 0.0569, + "step": 3054 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2459840551018715, + "epoch": 4.895833333333333, + "grad_norm": 0.0089957220479846, + "learning_rate": 1e-06, + "loss": 0.0989, + "step": 3055 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.24934467673301697, + "epoch": 4.897435897435898, + "grad_norm": 0.02581534907221794, + "learning_rate": 1e-06, + "loss": 0.0898, + "step": 3056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.115234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3635.0, + "completions/mean_length": 3645.541015625, + "completions/mean_terminated_length": 1986.4481201171875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.23428595066070557, + "epoch": 4.899038461538462, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1076.086669921875, + "learning_rate": 1e-06, + "loss": 0.1253, + "num_tokens": 1838393661.0, + "reward": 0.22710196673870087, + "reward_std": 0.09757393598556519, + "rewards/progression_diversity/mean": -0.0007409834070131183, + "rewards/progression_diversity/std": 0.006571199279278517, + "rewards/symbolic_reward_accuracy/mean": 0.0859375, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.6222655773162842, + "rewards/symbolic_reward_partial_score/std": 0.27893805503845215, + "rewards/tag_count_reward/mean": -0.111328125, + "rewards/tag_count_reward/std": 0.31484565138816833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9937641620635986, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 29.560943603515625, + "step": 3057 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2234666869044304, + "epoch": 4.9006410256410255, + "grad_norm": 0.025841714814305305, + "learning_rate": 1e-06, + "loss": 0.1654, + "step": 3058 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.23846197873353958, + "epoch": 4.902243589743589, + "grad_norm": 0.01864476129412651, + "learning_rate": 1e-06, + "loss": 0.0935, + "step": 3059 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.22488000988960266, + "epoch": 4.903846153846154, + "grad_norm": 0.20488475263118744, + "learning_rate": 1e-06, + "loss": 0.1767, + "step": 3060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3840.0, + "completions/mean_length": 2616.1171875, + "completions/mean_terminated_length": 1968.5479736328125, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "entropy": 0.2517363280057907, + "epoch": 4.905448717948718, + "frac_reward_zero_std": 0.40625, + "grad_norm": 162.09591674804688, + "learning_rate": 1e-06, + "loss": 0.0271, + "num_tokens": 1840538281.0, + "reward": 0.3412300944328308, + "reward_std": 0.05017785727977753, + "rewards/progression_diversity/mean": -4.024427471449599e-05, + "rewards/progression_diversity/std": 0.0005465570138767362, + "rewards/symbolic_reward_accuracy/mean": 0.21484375, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.7214192152023315, + "rewards/symbolic_reward_partial_score/std": 0.2416561096906662, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0312939882278442, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 772.0, + "sampling/sampling_logp_difference/mean": 12.591215133666992, + "step": 3061 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.24662530422210693, + "epoch": 4.907051282051282, + "grad_norm": 3917.875732421875, + "learning_rate": 1e-06, + "loss": 0.147, + "step": 3062 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2500121220946312, + "epoch": 4.908653846153846, + "grad_norm": 2733.346923828125, + "learning_rate": 1e-06, + "loss": 0.2713, + "step": 3063 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23512417078018188, + "epoch": 4.910256410256411, + "grad_norm": 0.14125868678092957, + "learning_rate": 1e-06, + "loss": 0.1516, + "step": 3064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3573.0, + "completions/mean_length": 3311.6796875, + "completions/mean_terminated_length": 1928.2159423828125, + "completions/min_length": 385.0, + "completions/min_terminated_length": 385.0, + "entropy": 0.2293601930141449, + "epoch": 4.9118589743589745, + "frac_reward_zero_std": 0.34375, + "grad_norm": 1017.6434326171875, + "learning_rate": 1e-06, + "loss": 0.0998, + "num_tokens": 1843132549.0, + "reward": 0.29114842414855957, + "reward_std": 0.08108247071504593, + "rewards/progression_diversity/mean": -0.00039431118057109416, + "rewards/progression_diversity/std": 0.004132647532969713, + "rewards/symbolic_reward_accuracy/mean": 0.1640625, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.6723307371139526, + "rewards/symbolic_reward_partial_score/std": 0.2803674042224884, + "rewards/tag_count_reward/mean": -0.08984375, + "rewards/tag_count_reward/std": 0.2862374484539032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.004433274269104, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 24.679431915283203, + "step": 3065 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.22065846621990204, + "epoch": 4.913461538461538, + "grad_norm": 0.017783688381314278, + "learning_rate": 1e-06, + "loss": 0.1363, + "step": 3066 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23686835914850235, + "epoch": 4.915064102564102, + "grad_norm": 0.03829358518123627, + "learning_rate": 1e-06, + "loss": 0.0933, + "step": 3067 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22281410545110703, + "epoch": 4.916666666666667, + "grad_norm": 0.013630473986268044, + "learning_rate": 1e-06, + "loss": 0.1489, + "step": 3068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3399.0, + "completions/mean_length": 3175.4453125, + "completions/mean_terminated_length": 1933.615478515625, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.23225311189889908, + "epoch": 4.918269230769231, + "frac_reward_zero_std": 0.34375, + "grad_norm": 1236.8885498046875, + "learning_rate": 1e-06, + "loss": 0.0914, + "num_tokens": 1845645129.0, + "reward": 0.29391324520111084, + "reward_std": 0.04953677952289581, + "rewards/progression_diversity/mean": -0.0002773654996417463, + "rewards/progression_diversity/std": 0.003748962190002203, + "rewards/symbolic_reward_accuracy/mean": 0.15625, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.6932617425918579, + "rewards/symbolic_reward_partial_score/std": 0.2669588029384613, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.011780023574829, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 22.082910537719727, + "step": 3069 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.22503124177455902, + "epoch": 4.919871794871795, + "grad_norm": 0.12676434218883514, + "learning_rate": 1e-06, + "loss": 0.1061, + "step": 3070 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22606287896633148, + "epoch": 4.921474358974359, + "grad_norm": 0.02233606018126011, + "learning_rate": 1e-06, + "loss": 0.106, + "step": 3071 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2343372032046318, + "epoch": 4.923076923076923, + "grad_norm": 0.014616597443819046, + "learning_rate": 1e-06, + "loss": 0.1043, + "step": 3072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3779.0, + "completions/mean_length": 3257.8828125, + "completions/mean_terminated_length": 1993.053466796875, + "completions/min_length": 304.0, + "completions/min_terminated_length": 304.0, + "entropy": 0.21944886445999146, + "epoch": 4.924679487179487, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1409.2808837890625, + "learning_rate": 1e-06, + "loss": 0.1516, + "num_tokens": 1848203293.0, + "reward": 0.21257862448692322, + "reward_std": 0.07666586339473724, + "rewards/progression_diversity/mean": -0.00044064479880034924, + "rewards/progression_diversity/std": 0.003953089937567711, + "rewards/symbolic_reward_accuracy/mean": 0.06640625, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.6037923097610474, + "rewards/symbolic_reward_partial_score/std": 0.2407861053943634, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9995574355125427, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 28.18425941467285, + "step": 3073 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2246202975511551, + "epoch": 4.926282051282051, + "grad_norm": 0.31535470485687256, + "learning_rate": 1e-06, + "loss": 0.1254, + "step": 3074 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.23829004913568497, + "epoch": 4.927884615384615, + "grad_norm": 0.02114877477288246, + "learning_rate": 1e-06, + "loss": 0.0652, + "step": 3075 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.22181038558483124, + "epoch": 4.92948717948718, + "grad_norm": 0.021001050248742104, + "learning_rate": 1e-06, + "loss": 0.1621, + "step": 3076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3972.0, + "completions/mean_length": 2757.482421875, + "completions/mean_terminated_length": 1998.892822265625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "entropy": 0.2369692623615265, + "epoch": 4.931089743589744, + "frac_reward_zero_std": 0.34375, + "grad_norm": 1576.586181640625, + "learning_rate": 1e-06, + "loss": 0.1226, + "num_tokens": 1850362948.0, + "reward": 0.3155801594257355, + "reward_std": 0.0678568109869957, + "rewards/progression_diversity/mean": -9.189260163111612e-05, + "rewards/progression_diversity/std": 0.0013428285019472241, + "rewards/symbolic_reward_accuracy/mean": 0.177734375, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.7120931148529053, + "rewards/symbolic_reward_partial_score/std": 0.23257166147232056, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.022636890411377, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 18.015220642089844, + "step": 3077 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2449949011206627, + "epoch": 4.9326923076923075, + "grad_norm": 986.05517578125, + "learning_rate": 1e-06, + "loss": 0.1112, + "step": 3078 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2491198480129242, + "epoch": 4.934294871794872, + "grad_norm": 0.01813652738928795, + "learning_rate": 1e-06, + "loss": 0.0695, + "step": 3079 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.24884822964668274, + "epoch": 4.935897435897436, + "grad_norm": 0.030230067670345306, + "learning_rate": 1e-06, + "loss": 0.0596, + "step": 3080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3653.0, + "completions/mean_length": 3115.162109375, + "completions/mean_terminated_length": 2051.41552734375, + "completions/min_length": 859.0, + "completions/min_terminated_length": 859.0, + "entropy": 0.2273581624031067, + "epoch": 4.9375, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1620.06494140625, + "learning_rate": 1e-06, + "loss": 0.1231, + "num_tokens": 1852808343.0, + "reward": 0.3891579508781433, + "reward_std": 0.09541543573141098, + "rewards/progression_diversity/mean": -0.0007119444198906422, + "rewards/progression_diversity/std": 0.004969864152371883, + "rewards/symbolic_reward_accuracy/mean": 0.298828125, + "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, + "rewards/symbolic_reward_partial_score/mean": 0.7229980230331421, + "rewards/symbolic_reward_partial_score/std": 0.27403724193573, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0014233589172363, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 27.424591064453125, + "step": 3081 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.22573982179164886, + "epoch": 4.939102564102564, + "grad_norm": 0.013195838779211044, + "learning_rate": 1e-06, + "loss": 0.1688, + "step": 3082 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.22488176822662354, + "epoch": 4.940705128205128, + "grad_norm": 0.01642797514796257, + "learning_rate": 1e-06, + "loss": 0.1161, + "step": 3083 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.22580020129680634, + "epoch": 4.9423076923076925, + "grad_norm": 0.030097253620624542, + "learning_rate": 1e-06, + "loss": 0.1038, + "step": 3084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3909.0, + "completions/mean_length": 2938.462890625, + "completions/mean_terminated_length": 2012.1524658203125, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "entropy": 0.24025756865739822, + "epoch": 4.943910256410256, + "frac_reward_zero_std": 0.3125, + "grad_norm": 427.4413757324219, + "learning_rate": 1e-06, + "loss": 0.0368, + "num_tokens": 1855159284.0, + "reward": 0.3869500756263733, + "reward_std": 0.09471500664949417, + "rewards/progression_diversity/mean": -0.0007927106926217675, + "rewards/progression_diversity/std": 0.0072387754917144775, + "rewards/symbolic_reward_accuracy/mean": 0.28515625, + "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, + "rewards/symbolic_reward_partial_score/mean": 0.7403808832168579, + "rewards/symbolic_reward_partial_score/std": 0.2626368999481201, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.011838674545288, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 22.941444396972656, + "step": 3085 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2293272241950035, + "epoch": 4.94551282051282, + "grad_norm": 0.014672359451651573, + "learning_rate": 1e-06, + "loss": 0.0897, + "step": 3086 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.23675543069839478, + "epoch": 4.947115384615385, + "grad_norm": 0.03274073079228401, + "learning_rate": 1e-06, + "loss": 2.9293, + "step": 3087 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.21905168890953064, + "epoch": 4.948717948717949, + "grad_norm": 0.013550637289881706, + "learning_rate": 1e-06, + "loss": 0.181, + "step": 3088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3887.0, + "completions/mean_length": 2637.939453125, + "completions/mean_terminated_length": 1991.396728515625, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.23665142059326172, + "epoch": 4.950320512820513, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1990.0498046875, + "learning_rate": 1e-06, + "loss": 0.1275, + "num_tokens": 1857301445.0, + "reward": 0.35283172130584717, + "reward_std": 0.07654442638158798, + "rewards/progression_diversity/mean": -0.0010098177008330822, + "rewards/progression_diversity/std": 0.007632537744939327, + "rewards/symbolic_reward_accuracy/mean": 0.23828125, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.7145507335662842, + "rewards/symbolic_reward_partial_score/std": 0.2468079924583435, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0146788358688354, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 22.055877685546875, + "step": 3089 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2322361320257187, + "epoch": 4.951923076923077, + "grad_norm": 0.017291007563471794, + "learning_rate": 1e-06, + "loss": 0.1592, + "step": 3090 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24952280521392822, + "epoch": 4.953525641025641, + "grad_norm": 0.010401350446045399, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 3091 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2399914711713791, + "epoch": 4.955128205128205, + "grad_norm": 0.35759103298187256, + "learning_rate": 1e-06, + "loss": 0.0707, + "step": 3092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3963.0, + "completions/mean_length": 3135.34375, + "completions/mean_terminated_length": 2042.9598388671875, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "entropy": 0.2194850966334343, + "epoch": 4.956730769230769, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1517.0322265625, + "learning_rate": 1e-06, + "loss": 0.2334, + "num_tokens": 1859834453.0, + "reward": 0.2989712357521057, + "reward_std": 0.10124389827251434, + "rewards/progression_diversity/mean": -0.0013144873082637787, + "rewards/progression_diversity/std": 0.011923530139029026, + "rewards/symbolic_reward_accuracy/mean": 0.185546875, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.6483073234558105, + "rewards/symbolic_reward_partial_score/std": 0.26260486245155334, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9993857145309448, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 29.454757690429688, + "step": 3093 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23910460621118546, + "epoch": 4.958333333333333, + "grad_norm": 0.11438003182411194, + "learning_rate": 1e-06, + "loss": 0.0364, + "step": 3094 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2314603254199028, + "epoch": 4.959935897435898, + "grad_norm": 0.013372881338000298, + "learning_rate": 1e-06, + "loss": 0.1093, + "step": 3095 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.22888445109128952, + "epoch": 4.961538461538462, + "grad_norm": 0.04182210564613342, + "learning_rate": 1e-06, + "loss": 0.1042, + "step": 3096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3800.0, + "completions/mean_length": 3504.990234375, + "completions/mean_terminated_length": 2049.10205078125, + "completions/min_length": 499.0, + "completions/min_terminated_length": 499.0, + "entropy": 0.21291837096214294, + "epoch": 4.9631410256410255, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1515.2686767578125, + "learning_rate": 1e-06, + "loss": 0.1687, + "num_tokens": 1862539136.0, + "reward": 0.38587409257888794, + "reward_std": 0.12614938616752625, + "rewards/progression_diversity/mean": -0.0014604174066334963, + "rewards/progression_diversity/std": 0.010783915407955647, + "rewards/symbolic_reward_accuracy/mean": 0.310546875, + "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, + "rewards/symbolic_reward_partial_score/mean": 0.6984049677848816, + "rewards/symbolic_reward_partial_score/std": 0.3126681447029114, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896366000175476, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 34.82575607299805, + "step": 3097 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.23361308127641678, + "epoch": 4.964743589743589, + "grad_norm": 99630.9453125, + "learning_rate": 1e-06, + "loss": 1.774, + "step": 3098 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.21884525567293167, + "epoch": 4.966346153846154, + "grad_norm": 0.037266023457050323, + "learning_rate": 1e-06, + "loss": 0.1422, + "step": 3099 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.21776709705591202, + "epoch": 4.967948717948718, + "grad_norm": 0.022990232333540916, + "learning_rate": 1e-06, + "loss": 0.1704, + "step": 3100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3794.0, + "completions/mean_length": 3065.734375, + "completions/mean_terminated_length": 1998.0252685546875, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.2290486916899681, + "epoch": 4.969551282051282, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1406.708984375, + "learning_rate": 1e-06, + "loss": 0.0795, + "num_tokens": 1864989176.0, + "reward": 0.27424493432044983, + "reward_std": 0.08562411367893219, + "rewards/progression_diversity/mean": -0.0012889985227957368, + "rewards/progression_diversity/std": 0.011297719553112984, + "rewards/symbolic_reward_accuracy/mean": 0.134765625, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.6680989265441895, + "rewards/symbolic_reward_partial_score/std": 0.251154363155365, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.004550576210022, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 27.251415252685547, + "step": 3101 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.21562334895133972, + "epoch": 4.971153846153846, + "grad_norm": 0.22579920291900635, + "learning_rate": 1e-06, + "loss": 0.1258, + "step": 3102 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.22457308322191238, + "epoch": 4.972756410256411, + "grad_norm": 0.023017656058073044, + "learning_rate": 1e-06, + "loss": 423466112.0, + "step": 3103 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2277710661292076, + "epoch": 4.9743589743589745, + "grad_norm": 0.0158623605966568, + "learning_rate": 1e-06, + "loss": 0.088, + "step": 3104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3392.0, + "completions/mean_length": 2962.814453125, + "completions/mean_terminated_length": 1947.766845703125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.22295588999986649, + "epoch": 4.975961538461538, + "frac_reward_zero_std": 0.28125, + "grad_norm": 615.591796875, + "learning_rate": 1e-06, + "loss": 0.1338, + "num_tokens": 1867381977.0, + "reward": 0.31032562255859375, + "reward_std": 0.08792345225811005, + "rewards/progression_diversity/mean": -0.001620155293494463, + "rewards/progression_diversity/std": 0.014947640709578991, + "rewards/symbolic_reward_accuracy/mean": 0.16796875, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.7187174558639526, + "rewards/symbolic_reward_partial_score/std": 0.2543519139289856, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0067986249923706, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 26.045305252075195, + "step": 3105 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2358352467417717, + "epoch": 4.977564102564102, + "grad_norm": 0.02432454563677311, + "learning_rate": 1e-06, + "loss": 0.0836, + "step": 3106 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.22167550772428513, + "epoch": 4.979166666666667, + "grad_norm": 0.01078137755393982, + "learning_rate": 1e-06, + "loss": 0.1256, + "step": 3107 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.22863591462373734, + "epoch": 4.980769230769231, + "grad_norm": 0.019278274849057198, + "learning_rate": 1e-06, + "loss": 0.0846, + "step": 3108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3294.0, + "completions/mean_length": 2566.49609375, + "completions/mean_terminated_length": 1857.1787109375, + "completions/min_length": 900.0, + "completions/min_terminated_length": 900.0, + "entropy": 0.25486068427562714, + "epoch": 4.982371794871795, + "frac_reward_zero_std": 0.34375, + "grad_norm": 590.8904418945312, + "learning_rate": 1e-06, + "loss": 0.0226, + "num_tokens": 1869483767.0, + "reward": 0.3811664581298828, + "reward_std": 0.06205814331769943, + "rewards/progression_diversity/mean": -0.0015223543159663677, + "rewards/progression_diversity/std": 0.01551423966884613, + "rewards/symbolic_reward_accuracy/mean": 0.267578125, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.7497721314430237, + "rewards/symbolic_reward_partial_score/std": 0.24262651801109314, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0239535570144653, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 18.67233657836914, + "step": 3109 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.23615090548992157, + "epoch": 4.983974358974359, + "grad_norm": 37597.58984375, + "learning_rate": 1e-06, + "loss": 0.8181, + "step": 3110 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.24501872807741165, + "epoch": 4.985576923076923, + "grad_norm": 0.01006036065518856, + "learning_rate": 1e-06, + "loss": 0.0984, + "step": 3111 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.24807747453451157, + "epoch": 4.987179487179487, + "grad_norm": 0.02072441577911377, + "learning_rate": 1e-06, + "loss": 0.0622, + "step": 3112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3321.0, + "completions/mean_length": 2686.251953125, + "completions/mean_terminated_length": 1923.697021484375, + "completions/min_length": 427.0, + "completions/min_terminated_length": 427.0, + "entropy": 0.234246626496315, + "epoch": 4.988782051282051, + "frac_reward_zero_std": 0.28125, + "grad_norm": 554.318603515625, + "learning_rate": 1e-06, + "loss": 0.0841, + "num_tokens": 1871779112.0, + "reward": 0.3497743010520935, + "reward_std": 0.06554560363292694, + "rewards/progression_diversity/mean": -0.0010880029294639826, + "rewards/progression_diversity/std": 0.008479480631649494, + "rewards/symbolic_reward_accuracy/mean": 0.236328125, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.7102213501930237, + "rewards/symbolic_reward_partial_score/std": 0.2495347112417221, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0156009197235107, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 21.600927352905273, + "step": 3113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2407403290271759, + "epoch": 4.990384615384615, + "grad_norm": 0.019321072846651077, + "learning_rate": 1e-06, + "loss": 0.0487, + "step": 3114 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.23200398683547974, + "epoch": 4.99198717948718, + "grad_norm": 0.015974221751093864, + "learning_rate": 1e-06, + "loss": 0.1028, + "step": 3115 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23310399055480957, + "epoch": 4.993589743589744, + "grad_norm": 0.016877546906471252, + "learning_rate": 1e-06, + "loss": 0.1081, + "step": 3116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3118.0, + "completions/mean_length": 2393.515625, + "completions/mean_terminated_length": 1883.740966796875, + "completions/min_length": 592.0, + "completions/min_terminated_length": 592.0, + "entropy": 0.24390202015638351, + "epoch": 4.9951923076923075, + "frac_reward_zero_std": 0.5, + "grad_norm": 1346.7171630859375, + "learning_rate": 1e-06, + "loss": 0.0759, + "num_tokens": 1873829408.0, + "reward": 0.3640533685684204, + "reward_std": 0.027338774874806404, + "rewards/progression_diversity/mean": -0.0004282901354599744, + "rewards/progression_diversity/std": 0.0046591991558671, + "rewards/symbolic_reward_accuracy/mean": 0.25, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.7239420413970947, + "rewards/symbolic_reward_partial_score/std": 0.23576416075229645, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0342185497283936, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 776.0, + "sampling/sampling_logp_difference/mean": 13.47022533416748, + "step": 3117 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.24693401902914047, + "epoch": 4.996794871794872, + "grad_norm": 2261.0185546875, + "learning_rate": 1e-06, + "loss": 0.0913, + "step": 3118 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24139049649238586, + "epoch": 4.998397435897436, + "grad_norm": 0.024692002683877945, + "learning_rate": 1e-06, + "loss": 0.0602, + "step": 3119 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2525310665369034, + "epoch": 5.0, + "grad_norm": 0.013732725754380226, + "learning_rate": 1e-06, + "loss": 0.0156, + "step": 3120 + }, + { + "epoch": 5.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.063720703125, + "eval_completions/max_length": 16002.59375, + "eval_completions/max_terminated_length": 3275.65625, + "eval_completions/mean_length": 2817.200927734375, + "eval_completions/mean_terminated_length": 1894.647304534912, + "eval_completions/min_length": 834.84375, + "eval_completions/min_terminated_length": 834.84375, + "eval_entropy": 0.22543939435854554, + "eval_frac_reward_zero_std": 0.28125, + "eval_loss": 0.04206552356481552, + "eval_num_tokens": 1873829408.0, + "eval_reward": 0.24393098056316376, + "eval_reward_std": 0.05101948481751606, + "eval_rewards/progression_diversity/mean": -0.001373334529262138, + "eval_rewards/progression_diversity/std": 0.01101221157978216, + "eval_rewards/symbolic_reward_accuracy/mean": 0.0966796875, + "eval_rewards/symbolic_reward_accuracy/std": 0.21552498079836369, + "eval_rewards/symbolic_reward_partial_score/mean": 0.6393208876252174, + "eval_rewards/symbolic_reward_partial_score/std": 0.22751979576423764, + "eval_rewards/tag_count_reward/mean": -0.05859375, + "eval_rewards/tag_count_reward/std": 0.21680113021284342, + "eval_runtime": 4356.8719, + "eval_samples_per_second": 0.057, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0280762165784836, + "eval_sampling/importance_sampling_ratio/min": 1.5225041253212082e-39, + "eval_sampling/sampling_logp_difference/max": 754.4348351955414, + "eval_sampling/sampling_logp_difference/mean": 16.028895314550027, + "eval_steps_per_second": 0.0, + "step": 3120 + }, + { + "epoch": 5.0, + "step": 3120, + "total_flos": 0.0, + "train_loss": 235461599.143655, + "train_runtime": 175420.2215, + "train_samples_per_second": 0.143, + "train_steps_per_second": 0.018 + } + ], + "logging_steps": 1, + "max_steps": 3120, + "num_input_tokens_seen": 1873829408, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}