{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 3120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 8497.0, "completions/mean_length": 7881.193359375, "completions/mean_terminated_length": 1474.9691162109375, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 0.14000436663627625, "epoch": 0.0016025641025641025, "frac_reward_zero_std": 0.03125, "grad_norm": 568.662109375, "learning_rate": 1e-06, "loss": 0.274, "num_tokens": 4883523.0, "reward": 0.28620433807373047, "reward_std": 0.19634175300598145, "rewards/progression_diversity/mean": -0.09636208415031433, "rewards/progression_diversity/std": 0.1318667083978653, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.5490233898162842, "rewards/symbolic_reward_partial_score/std": 0.4077247977256775, "rewards/tag_count_reward/mean": -0.369140625, "rewards/tag_count_reward/std": 0.4830440282821655, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9890698790550232, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 412.0, "sampling/sampling_logp_difference/mean": 11.238412857055664, "step": 1 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.15292271971702576, "epoch": 0.003205128205128205, "grad_norm": 1453.4033203125, "learning_rate": 1e-06, "loss": 0.2567, "step": 2 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.1447555311024189, "epoch": 0.004807692307692308, "grad_norm": 155.9544219970703, "learning_rate": 1e-06, "loss": 0.2182, "step": 3 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3828125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.15076642483472824, "epoch": 0.00641025641025641, "grad_norm": 180.8573455810547, "learning_rate": 1e-06, "loss": 0.2647, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.47265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8724.0, "completions/mean_length": 8531.75, "completions/mean_terminated_length": 1493.807373046875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.141545832157135, "epoch": 0.008012820512820512, "frac_reward_zero_std": 0.0, "grad_norm": 267.66961669921875, "learning_rate": 1e-06, "loss": 0.1422, "num_tokens": 10076899.0, "reward": 0.27696967124938965, "reward_std": 0.21476207673549652, "rewards/progression_diversity/mean": -0.10186256468296051, "rewards/progression_diversity/std": 0.1328817903995514, "rewards/symbolic_reward_accuracy/mean": 0.267578125, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.5268880128860474, "rewards/symbolic_reward_partial_score/std": 0.4123326241970062, "rewards/tag_count_reward/mean": -0.40625, "rewards/tag_count_reward/std": 0.49161264300346375, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.984929621219635, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 412.0, "sampling/sampling_logp_difference/mean": 11.871856689453125, "step": 5 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.14717822521924973, "epoch": 0.009615384615384616, "grad_norm": 116.89068603515625, "learning_rate": 1e-06, "loss": 0.2298, "step": 6 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.13329345732927322, "epoch": 0.011217948717948718, "grad_norm": 971.0422973632812, "learning_rate": 1e-06, "loss": 0.3372, "step": 7 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.15128736197948456, "epoch": 0.01282051282051282, "grad_norm": 416.9343566894531, "learning_rate": 1e-06, "loss": 0.3162, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.44140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8558.0, "completions/mean_length": 8034.72265625, "completions/mean_terminated_length": 1437.0419921875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.12974420934915543, "epoch": 0.014423076923076924, "frac_reward_zero_std": 0.0, "grad_norm": 525.6014404296875, "learning_rate": 1e-06, "loss": 0.3298, "num_tokens": 15093045.0, "reward": 0.22926117479801178, "reward_std": 0.21232059597969055, "rewards/progression_diversity/mean": -0.10122619569301605, "rewards/progression_diversity/std": 0.1385258138179779, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.5384114980697632, "rewards/symbolic_reward_partial_score/std": 0.38595736026763916, "rewards/tag_count_reward/mean": -0.390625, "rewards/tag_count_reward/std": 0.48836761713027954, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9804725050926208, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 412.0, "sampling/sampling_logp_difference/mean": 12.684000968933105, "step": 9 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.1481311321258545, "epoch": 0.016025641025641024, "grad_norm": 207.61981201171875, "learning_rate": 1e-06, "loss": 0.2655, "step": 10 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.14685294777154922, "epoch": 0.017628205128205128, "grad_norm": 452.76580810546875, "learning_rate": 1e-06, "loss": 0.3486, "step": 11 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.13027992472052574, "epoch": 0.019230769230769232, "grad_norm": 58.394657135009766, "learning_rate": 1e-06, "loss": 0.3177, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.384765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4718.0, "completions/mean_length": 7190.44921875, "completions/mean_terminated_length": 1440.831787109375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.145263209939003, "epoch": 0.020833333333333332, "frac_reward_zero_std": 0.0, "grad_norm": 442.4850158691406, "learning_rate": 1e-06, "loss": 0.2593, "num_tokens": 19648011.0, "reward": 0.234406977891922, "reward_std": 0.19648107886314392, "rewards/progression_diversity/mean": -0.08762294054031372, "rewards/progression_diversity/std": 0.1332884430885315, "rewards/symbolic_reward_accuracy/mean": 0.16796875, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.5577148199081421, "rewards/symbolic_reward_partial_score/std": 0.37337514758110046, "rewards/tag_count_reward/mean": -0.328125, "rewards/tag_count_reward/std": 0.4699897766113281, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.983834981918335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 414.0, "sampling/sampling_logp_difference/mean": 12.08355712890625, "step": 13 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.14843080937862396, "epoch": 0.022435897435897436, "grad_norm": 94.6893081665039, "learning_rate": 1e-06, "loss": 0.2235, "step": 14 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.16313014179468155, "epoch": 0.02403846153846154, "grad_norm": 207.52377319335938, "learning_rate": 1e-06, "loss": 0.2741, "step": 15 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.15617449581623077, "epoch": 0.02564102564102564, "grad_norm": 304.16375732421875, "learning_rate": 1e-06, "loss": 0.3583, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.349609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 8593.0, "completions/mean_length": 6609.98046875, "completions/mean_terminated_length": 1356.078125, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.20762895047664642, "epoch": 0.027243589743589744, "frac_reward_zero_std": 0.0625, "grad_norm": 390.9715576171875, "learning_rate": 1e-06, "loss": 0.2115, "num_tokens": 23865953.0, "reward": 0.32011666893959045, "reward_std": 0.19630393385887146, "rewards/progression_diversity/mean": -0.08647972345352173, "rewards/progression_diversity/std": 0.1337437778711319, "rewards/symbolic_reward_accuracy/mean": 0.28515625, "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, "rewards/symbolic_reward_partial_score/mean": 0.6044433116912842, "rewards/symbolic_reward_partial_score/std": 0.38585761189460754, "rewards/tag_count_reward/mean": -0.314453125, "rewards/tag_count_reward/std": 0.4647517800331116, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.987565815448761, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 416.0, "sampling/sampling_logp_difference/mean": 11.790186882019043, "step": 17 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.16485845297574997, "epoch": 0.028846153846153848, "grad_norm": 20.569774627685547, "learning_rate": 1e-06, "loss": 0.3498, "step": 18 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.14989716559648514, "epoch": 0.030448717948717948, "grad_norm": 29.013126373291016, "learning_rate": 1e-06, "loss": 0.2945, "step": 19 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.16761308163404465, "epoch": 0.03205128205128205, "grad_norm": 17.99753761291504, "learning_rate": 1e-06, "loss": 0.2861, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.396484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15963.0, "completions/mean_length": 7455.453125, "completions/mean_terminated_length": 1589.7735595703125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.1430811509490013, "epoch": 0.03365384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 338.8027648925781, "learning_rate": 1e-06, "loss": 0.3612, "num_tokens": 28525033.0, "reward": 0.2473517507314682, "reward_std": 0.21615710854530334, "rewards/progression_diversity/mean": -0.09685845673084259, "rewards/progression_diversity/std": 0.13814808428287506, "rewards/symbolic_reward_accuracy/mean": 0.193359375, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.5562499761581421, "rewards/symbolic_reward_partial_score/std": 0.38326844573020935, "rewards/tag_count_reward/mean": -0.345703125, "rewards/tag_count_reward/std": 0.4760620892047882, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9782972931861877, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 416.0, "sampling/sampling_logp_difference/mean": 13.17431640625, "step": 21 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.15020054578781128, "epoch": 0.035256410256410256, "grad_norm": 72.29728698730469, "learning_rate": 1e-06, "loss": 0.2999, "step": 22 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.14304012060165405, "epoch": 0.03685897435897436, "grad_norm": 74.60310363769531, "learning_rate": 1e-06, "loss": 0.2514, "step": 23 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.1455524042248726, "epoch": 0.038461538461538464, "grad_norm": 23.46770668029785, "learning_rate": 1e-06, "loss": 0.2941, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.431640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8141.0, "completions/mean_length": 7897.568359375, "completions/mean_terminated_length": 1452.54638671875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.15165143460035324, "epoch": 0.04006410256410257, "frac_reward_zero_std": 0.03125, "grad_norm": 739.3831176757812, "learning_rate": 1e-06, "loss": 0.2292, "num_tokens": 33441724.0, "reward": 0.2494029402732849, "reward_std": 0.1849609613418579, "rewards/progression_diversity/mean": -0.10511670261621475, "rewards/progression_diversity/std": 0.14027173817157745, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.5431803464889526, "rewards/symbolic_reward_partial_score/std": 0.38958677649497986, "rewards/tag_count_reward/mean": -0.37890625, "rewards/tag_count_reward/std": 0.4855891764163971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9824032783508301, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 416.0, "sampling/sampling_logp_difference/mean": 12.627696990966797, "step": 25 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.14603624492883682, "epoch": 0.041666666666666664, "grad_norm": 473.4931335449219, "learning_rate": 1e-06, "loss": 0.2678, "step": 26 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3515625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.13400625064969063, "epoch": 0.04326923076923077, "grad_norm": 485.1720886230469, "learning_rate": 1e-06, "loss": 0.3149, "step": 27 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.14134199172258377, "epoch": 0.04487179487179487, "grad_norm": 265.03302001953125, "learning_rate": 1e-06, "loss": 0.2932, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5220.0, "completions/mean_length": 8731.86328125, "completions/mean_terminated_length": 1543.492431640625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.12327419593930244, "epoch": 0.046474358974358976, "frac_reward_zero_std": 0.03125, "grad_norm": 270.61407470703125, "learning_rate": 1e-06, "loss": 0.2511, "num_tokens": 38830534.0, "reward": 0.20257076621055603, "reward_std": 0.1862163543701172, "rewards/progression_diversity/mean": -0.11499577015638351, "rewards/progression_diversity/std": 0.14146001636981964, "rewards/symbolic_reward_accuracy/mean": 0.166015625, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.49026691913604736, "rewards/symbolic_reward_partial_score/std": 0.3924698829650879, "rewards/tag_count_reward/mean": -0.4296875, "rewards/tag_count_reward/std": 0.4955156147480011, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.978987991809845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 416.0, "sampling/sampling_logp_difference/mean": 13.129805564880371, "step": 29 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.13143375515937805, "epoch": 0.04807692307692308, "grad_norm": 348.0553283691406, "learning_rate": 1e-06, "loss": 0.2894, "step": 30 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.1378588154911995, "epoch": 0.049679487179487176, "grad_norm": 297.4830322265625, "learning_rate": 1e-06, "loss": 0.2873, "step": 31 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.4765625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.11010397225618362, "epoch": 0.05128205128205128, "grad_norm": 120.72930145263672, "learning_rate": 1e-06, "loss": 0.3805, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.412109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 10000.0, "completions/mean_length": 7616.541015625, "completions/mean_terminated_length": 1470.5814208984375, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "entropy": 0.15361008048057556, "epoch": 0.052884615384615384, "frac_reward_zero_std": 0.09375, "grad_norm": 288.5027160644531, "learning_rate": 1e-06, "loss": 0.2605, "num_tokens": 43573467.0, "reward": 0.26951634883880615, "reward_std": 0.16951517760753632, "rewards/progression_diversity/mean": -0.10158812254667282, "rewards/progression_diversity/std": 0.14149615168571472, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.549560546875, "rewards/symbolic_reward_partial_score/std": 0.393799364566803, "rewards/tag_count_reward/mean": -0.361328125, "rewards/tag_count_reward/std": 0.48085519671440125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9846019744873047, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 420.0, "sampling/sampling_logp_difference/mean": 12.345019340515137, "step": 33 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.1322014182806015, "epoch": 0.05448717948717949, "grad_norm": 719.1787719726562, "learning_rate": 1e-06, "loss": 0.2974, "step": 34 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.1471644788980484, "epoch": 0.05608974358974359, "grad_norm": 506.037109375, "learning_rate": 1e-06, "loss": 0.2677, "step": 35 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.144605353474617, "epoch": 0.057692307692307696, "grad_norm": 172.5960693359375, "learning_rate": 1e-06, "loss": 0.2405, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.376953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8070.0, "completions/mean_length": 7113.50390625, "completions/mean_terminated_length": 1504.7083740234375, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.15522907674312592, "epoch": 0.05929487179487179, "frac_reward_zero_std": 0.0625, "grad_norm": 546.6926879882812, "learning_rate": 1e-06, "loss": 0.314, "num_tokens": 48071261.0, "reward": 0.29389289021492004, "reward_std": 0.20416641235351562, "rewards/progression_diversity/mean": -0.09996913373470306, "rewards/progression_diversity/std": 0.1435985416173935, "rewards/symbolic_reward_accuracy/mean": 0.25390625, "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, "rewards/symbolic_reward_partial_score/mean": 0.5851888656616211, "rewards/symbolic_reward_partial_score/std": 0.38423076272010803, "rewards/tag_count_reward/mean": -0.330078125, "rewards/tag_count_reward/std": 0.47070086002349854, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879682660102844, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 420.0, "sampling/sampling_logp_difference/mean": 12.075263023376465, "step": 37 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.1799706146121025, "epoch": 0.060897435897435896, "grad_norm": 217.79710388183594, "learning_rate": 1e-06, "loss": 0.2229, "step": 38 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.1559307500720024, "epoch": 0.0625, "grad_norm": 349.0166931152344, "learning_rate": 1e-06, "loss": 0.2719, "step": 39 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.15247832983732224, "epoch": 0.0641025641025641, "grad_norm": 172.07949829101562, "learning_rate": 1e-06, "loss": 0.319, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.427734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4535.0, "completions/mean_length": 7818.34375, "completions/mean_terminated_length": 1416.0272216796875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "entropy": 0.13341034203767776, "epoch": 0.06570512820512821, "frac_reward_zero_std": 0.03125, "grad_norm": 447.7645568847656, "learning_rate": 1e-06, "loss": 0.273, "num_tokens": 53003245.0, "reward": 0.218738853931427, "reward_std": 0.2060059905052185, "rewards/progression_diversity/mean": -0.1119544506072998, "rewards/progression_diversity/std": 0.1507621854543686, "rewards/symbolic_reward_accuracy/mean": 0.171875, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.5076009035110474, "rewards/symbolic_reward_partial_score/std": 0.3918800354003906, "rewards/tag_count_reward/mean": -0.35546875, "rewards/tag_count_reward/std": 0.47912323474884033, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9806472659111023, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 420.0, "sampling/sampling_logp_difference/mean": 13.268798828125, "step": 41 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.14531062543392181, "epoch": 0.0673076923076923, "grad_norm": 1331.5009765625, "learning_rate": 1e-06, "loss": 0.3872, "step": 42 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.1627102643251419, "epoch": 0.06891025641025642, "grad_norm": 724.1676025390625, "learning_rate": 1e-06, "loss": 0.3012, "step": 43 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.16679170727729797, "epoch": 0.07051282051282051, "grad_norm": 689.5390625, "learning_rate": 1e-06, "loss": 0.3947, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.369140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4762.0, "completions/mean_length": 6941.892578125, "completions/mean_terminated_length": 1416.9442138671875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.1360173150897026, "epoch": 0.07211538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 262.25537109375, "learning_rate": 1e-06, "loss": 0.3696, "num_tokens": 57407846.0, "reward": 0.32967936992645264, "reward_std": 0.2605833411216736, "rewards/progression_diversity/mean": -0.10432835668325424, "rewards/progression_diversity/std": 0.15316592156887054, "rewards/symbolic_reward_accuracy/mean": 0.302734375, "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, "rewards/symbolic_reward_partial_score/mean": 0.6082682609558105, "rewards/symbolic_reward_partial_score/std": 0.3930491805076599, "rewards/tag_count_reward/mean": -0.333984375, "rewards/tag_count_reward/std": 0.47209542989730835, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9809261560440063, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 422.0, "sampling/sampling_logp_difference/mean": 13.200485229492188, "step": 45 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.1455419659614563, "epoch": 0.07371794871794872, "grad_norm": 1122.5438232421875, "learning_rate": 1e-06, "loss": 0.2685, "step": 46 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.17444385588169098, "epoch": 0.07532051282051282, "grad_norm": 311.899169921875, "learning_rate": 1e-06, "loss": 0.2814, "step": 47 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.17196480184793472, "epoch": 0.07692307692307693, "grad_norm": 131.08229064941406, "learning_rate": 1e-06, "loss": 0.2203, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.41796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4885.0, "completions/mean_length": 7645.94921875, "completions/mean_terminated_length": 1370.97314453125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.14967508614063263, "epoch": 0.07852564102564102, "frac_reward_zero_std": 0.0, "grad_norm": 1408.8106689453125, "learning_rate": 1e-06, "loss": 0.2864, "num_tokens": 62251772.0, "reward": 0.27286070585250854, "reward_std": 0.2150975465774536, "rewards/progression_diversity/mean": -0.11773853003978729, "rewards/progression_diversity/std": 0.15623939037322998, "rewards/symbolic_reward_accuracy/mean": 0.251953125, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.5332520008087158, "rewards/symbolic_reward_partial_score/std": 0.40305641293525696, "rewards/tag_count_reward/mean": -0.37109375, "rewards/tag_count_reward/std": 0.4835699498653412, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9803931713104248, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 424.0, "sampling/sampling_logp_difference/mean": 13.439233779907227, "step": 49 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.1299237497150898, "epoch": 0.08012820512820513, "grad_norm": 226.3897705078125, "learning_rate": 1e-06, "loss": 0.3439, "step": 50 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.14567925781011581, "epoch": 0.08173076923076923, "grad_norm": 225.25894165039062, "learning_rate": 1e-06, "loss": 0.227, "step": 51 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.16078957170248032, "epoch": 0.08333333333333333, "grad_norm": 259.5469665527344, "learning_rate": 1e-06, "loss": 0.2553, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.384765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14739.0, "completions/mean_length": 7208.03515625, "completions/mean_terminated_length": 1469.4158935546875, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.16834494471549988, "epoch": 0.08493589743589744, "frac_reward_zero_std": 0.03125, "grad_norm": 538.5927734375, "learning_rate": 1e-06, "loss": 0.2477, "num_tokens": 66784174.0, "reward": 0.3087325990200043, "reward_std": 0.2367071658372879, "rewards/progression_diversity/mean": -0.1125800758600235, "rewards/progression_diversity/std": 0.15995118021965027, "rewards/symbolic_reward_accuracy/mean": 0.2890625, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.5712727904319763, "rewards/symbolic_reward_partial_score/std": 0.4000099301338196, "rewards/tag_count_reward/mean": -0.349609375, "rewards/tag_count_reward/std": 0.47731292247772217, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9840480089187622, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 428.0, "sampling/sampling_logp_difference/mean": 12.845325469970703, "step": 53 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.15644784271717072, "epoch": 0.08653846153846154, "grad_norm": 271.4656677246094, "learning_rate": 1e-06, "loss": 0.2644, "step": 54 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.1287785843014717, "epoch": 0.08814102564102565, "grad_norm": 238.35011291503906, "learning_rate": 1e-06, "loss": 0.2856, "step": 55 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.15913260728120804, "epoch": 0.08974358974358974, "grad_norm": 207.0217742919922, "learning_rate": 1e-06, "loss": 0.3107, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.44140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6438.0, "completions/mean_length": 8034.2265625, "completions/mean_terminated_length": 1436.15380859375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.14069371670484543, "epoch": 0.09134615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 261.5517578125, "learning_rate": 1e-06, "loss": 0.2341, "num_tokens": 71809458.0, "reward": 0.28543901443481445, "reward_std": 0.24921724200248718, "rewards/progression_diversity/mean": -0.13432222604751587, "rewards/progression_diversity/std": 0.17075546085834503, "rewards/symbolic_reward_accuracy/mean": 0.2734375, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.5334147214889526, "rewards/symbolic_reward_partial_score/std": 0.41190940141677856, "rewards/tag_count_reward/mean": -0.373046875, "rewards/tag_count_reward/std": 0.48408737778663635, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9736584424972534, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 432.0, "sampling/sampling_logp_difference/mean": 14.813698768615723, "step": 57 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.1330208107829094, "epoch": 0.09294871794871795, "grad_norm": 234.9654541015625, "learning_rate": 1e-06, "loss": 0.2835, "step": 58 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.4140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.13570880889892578, "epoch": 0.09455128205128205, "grad_norm": 195.11659240722656, "learning_rate": 1e-06, "loss": 0.2919, "step": 59 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.453125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.12034280598163605, "epoch": 0.09615384615384616, "grad_norm": 103.8945083618164, "learning_rate": 1e-06, "loss": 0.3713, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16376.0, "completions/mean_length": 7816.056640625, "completions/mean_terminated_length": 1563.773681640625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.1452302411198616, "epoch": 0.09775641025641026, "frac_reward_zero_std": 0.03125, "grad_norm": 524.85205078125, "learning_rate": 1e-06, "loss": 0.203, "num_tokens": 76649167.0, "reward": 0.2318515032529831, "reward_std": 0.18006575107574463, "rewards/progression_diversity/mean": -0.13223427534103394, "rewards/progression_diversity/std": 0.17053887248039246, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.5187825560569763, "rewards/symbolic_reward_partial_score/std": 0.3892955183982849, "rewards/tag_count_reward/mean": -0.361328125, "rewards/tag_count_reward/std": 0.48085519671440125, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9811015129089355, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 434.0, "sampling/sampling_logp_difference/mean": 13.738810539245605, "step": 61 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.1534811109304428, "epoch": 0.09935897435897435, "grad_norm": 12.576883316040039, "learning_rate": 1e-06, "loss": 0.276, "step": 62 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.14197808876633644, "epoch": 0.10096153846153846, "grad_norm": 69.41390991210938, "learning_rate": 1e-06, "loss": 0.3, "step": 63 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.15332899242639542, "epoch": 0.10256410256410256, "grad_norm": 46.41984939575195, "learning_rate": 1e-06, "loss": 0.3122, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4021.0, "completions/mean_length": 7709.984375, "completions/mean_terminated_length": 1380.29736328125, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "entropy": 0.11576578766107559, "epoch": 0.10416666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 849.912353515625, "learning_rate": 1e-06, "loss": 0.3962, "num_tokens": 81463175.0, "reward": 0.2805628180503845, "reward_std": 0.22776535153388977, "rewards/progression_diversity/mean": -0.13366013765335083, "rewards/progression_diversity/std": 0.1738085299730301, "rewards/symbolic_reward_accuracy/mean": 0.267578125, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.530810534954071, "rewards/symbolic_reward_partial_score/std": 0.40938085317611694, "rewards/tag_count_reward/mean": -0.37890625, "rewards/tag_count_reward/std": 0.4855891764163971, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9786410331726074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 436.0, "sampling/sampling_logp_difference/mean": 14.30560302734375, "step": 65 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.15666405856609344, "epoch": 0.10576923076923077, "grad_norm": 67.19806671142578, "learning_rate": 1e-06, "loss": 0.2378, "step": 66 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.14105742424726486, "epoch": 0.10737179487179487, "grad_norm": 82.8458023071289, "learning_rate": 1e-06, "loss": 0.3127, "step": 67 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.14606218039989471, "epoch": 0.10897435897435898, "grad_norm": 113.2381362915039, "learning_rate": 1e-06, "loss": 0.2326, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5696.0, "completions/mean_length": 7631.79296875, "completions/mean_terminated_length": 1446.9000244140625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.15287064760923386, "epoch": 0.11057692307692307, "frac_reward_zero_std": 0.0625, "grad_norm": 262.7812194824219, "learning_rate": 1e-06, "loss": 0.2887, "num_tokens": 86220381.0, "reward": 0.29364585876464844, "reward_std": 0.19495844841003418, "rewards/progression_diversity/mean": -0.13199639320373535, "rewards/progression_diversity/std": 0.17195309698581696, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.5769693851470947, "rewards/symbolic_reward_partial_score/std": 0.39537864923477173, "rewards/tag_count_reward/mean": -0.375, "rewards/tag_count_reward/std": 0.4845963716506958, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9830174446105957, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 436.0, "sampling/sampling_logp_difference/mean": 13.659445762634277, "step": 69 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.1318165846168995, "epoch": 0.11217948717948718, "grad_norm": 550.3621826171875, "learning_rate": 1e-06, "loss": 0.3745, "step": 70 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.1556747555732727, "epoch": 0.11378205128205128, "grad_norm": 144.26145935058594, "learning_rate": 1e-06, "loss": 0.2036, "step": 71 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.3671875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.1447812095284462, "epoch": 0.11538461538461539, "grad_norm": 35.39904022216797, "learning_rate": 1e-06, "loss": 0.2419, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.431640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8566.0, "completions/mean_length": 7925.490234375, "completions/mean_terminated_length": 1501.673583984375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "entropy": 0.1336100473999977, "epoch": 0.11698717948717949, "frac_reward_zero_std": 0.0, "grad_norm": 890.0448608398438, "learning_rate": 1e-06, "loss": 0.3014, "num_tokens": 91139272.0, "reward": 0.23745985329151154, "reward_std": 0.2408495545387268, "rewards/progression_diversity/mean": -0.14415252208709717, "rewards/progression_diversity/std": 0.18349310755729675, "rewards/symbolic_reward_accuracy/mean": 0.1953125, "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, "rewards/symbolic_reward_partial_score/mean": 0.5307128429412842, "rewards/symbolic_reward_partial_score/std": 0.38882943987846375, "rewards/tag_count_reward/mean": -0.375, "rewards/tag_count_reward/std": 0.4845963716506958, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9764099717140198, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 438.0, "sampling/sampling_logp_difference/mean": 14.951321601867676, "step": 73 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.13262945413589478, "epoch": 0.11858974358974358, "grad_norm": 236.77122497558594, "learning_rate": 1e-06, "loss": 0.3062, "step": 74 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.4375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6015625, "entropy": 0.12846176326274872, "epoch": 0.1201923076923077, "grad_norm": 424.5876159667969, "learning_rate": 1e-06, "loss": 0.3248, "step": 75 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.1426731012761593, "epoch": 0.12179487179487179, "grad_norm": 137.90576171875, "learning_rate": 1e-06, "loss": 0.2592, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.400390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6630.0, "completions/mean_length": 7431.486328125, "completions/mean_terminated_length": 1453.4234619140625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.1482650637626648, "epoch": 0.1233974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 503.6921691894531, "learning_rate": 1e-06, "loss": 0.309, "num_tokens": 95835905.0, "reward": 0.31015902757644653, "reward_std": 0.23790916800498962, "rewards/progression_diversity/mean": -0.12814107537269592, "rewards/progression_diversity/std": 0.17228002846240997, "rewards/symbolic_reward_accuracy/mean": 0.294921875, "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, "rewards/symbolic_reward_partial_score/mean": 0.5648274421691895, "rewards/symbolic_reward_partial_score/std": 0.4122779667377472, "rewards/tag_count_reward/mean": -0.349609375, "rewards/tag_count_reward/std": 0.47731292247772217, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9831830263137817, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 440.0, "sampling/sampling_logp_difference/mean": 13.701435089111328, "step": 77 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.15608947724103928, "epoch": 0.125, "grad_norm": 502.0856628417969, "learning_rate": 1e-06, "loss": 0.2134, "step": 78 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.1563669890165329, "epoch": 0.1266025641025641, "grad_norm": 40.49258804321289, "learning_rate": 1e-06, "loss": 0.2021, "step": 79 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6640625, "entropy": 0.13854920864105225, "epoch": 0.1282051282051282, "grad_norm": 24.90146255493164, "learning_rate": 1e-06, "loss": 0.3625, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.376953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8946.0, "completions/mean_length": 7095.765625, "completions/mean_terminated_length": 1476.2381591796875, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "entropy": 0.1516287624835968, "epoch": 0.12980769230769232, "frac_reward_zero_std": 0.0, "grad_norm": 1240.288818359375, "learning_rate": 1e-06, "loss": 0.3117, "num_tokens": 100263337.0, "reward": 0.2255856692790985, "reward_std": 0.19480521976947784, "rewards/progression_diversity/mean": -0.12746772170066833, "rewards/progression_diversity/std": 0.17732733488082886, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.5641438961029053, "rewards/symbolic_reward_partial_score/std": 0.36627063155174255, "rewards/tag_count_reward/mean": -0.349609375, "rewards/tag_count_reward/std": 0.47731292247772217, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9782713651657104, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 440.0, "sampling/sampling_logp_difference/mean": 14.690707206726074, "step": 81 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.17118209600448608, "epoch": 0.13141025641025642, "grad_norm": 143.5312042236328, "learning_rate": 1e-06, "loss": 0.2574, "step": 82 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.14265629649162292, "epoch": 0.1330128205128205, "grad_norm": 2867.10009765625, "learning_rate": 1e-06, "loss": 0.4231, "step": 83 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.15464875847101212, "epoch": 0.1346153846153846, "grad_norm": 54.51691436767578, "learning_rate": 1e-06, "loss": 0.2703, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.48046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3804.0, "completions/mean_length": 8625.69921875, "completions/mean_terminated_length": 1450.7293701171875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.13001390919089317, "epoch": 0.1362179487179487, "frac_reward_zero_std": 0.0, "grad_norm": 455.9178161621094, "learning_rate": 1e-06, "loss": 0.222, "num_tokens": 105565695.0, "reward": 0.1887899488210678, "reward_std": 0.2059934437274933, "rewards/progression_diversity/mean": -0.16299909353256226, "rewards/progression_diversity/std": 0.18564143776893616, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.47913411259651184, "rewards/symbolic_reward_partial_score/std": 0.38510748744010925, "rewards/tag_count_reward/mean": -0.447265625, "rewards/tag_count_reward/std": 0.4976975917816162, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9722630977630615, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 442.0, "sampling/sampling_logp_difference/mean": 15.671956062316895, "step": 85 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.1201215460896492, "epoch": 0.13782051282051283, "grad_norm": 307.8591003417969, "learning_rate": 1e-06, "loss": 0.3693, "step": 86 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.12688108533620834, "epoch": 0.13942307692307693, "grad_norm": 181.98202514648438, "learning_rate": 1e-06, "loss": 0.3682, "step": 87 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.4140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.12507511675357819, "epoch": 0.14102564102564102, "grad_norm": 60.04415512084961, "learning_rate": 1e-06, "loss": 0.3179, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4394.0, "completions/mean_length": 7166.556640625, "completions/mean_terminated_length": 1449.4083251953125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.16381006687879562, "epoch": 0.14262820512820512, "frac_reward_zero_std": 0.0, "grad_norm": 344.05682373046875, "learning_rate": 1e-06, "loss": 0.2595, "num_tokens": 109989372.0, "reward": 0.28582632541656494, "reward_std": 0.21509969234466553, "rewards/progression_diversity/mean": -0.12635371088981628, "rewards/progression_diversity/std": 0.17559568583965302, "rewards/symbolic_reward_accuracy/mean": 0.255859375, "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, "rewards/symbolic_reward_partial_score/mean": 0.5617838501930237, "rewards/symbolic_reward_partial_score/std": 0.3981553614139557, "rewards/tag_count_reward/mean": -0.349609375, "rewards/tag_count_reward/std": 0.47731292247772217, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9799978733062744, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 444.0, "sampling/sampling_logp_difference/mean": 14.372432708740234, "step": 89 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.14315053820610046, "epoch": 0.14423076923076922, "grad_norm": 354.8428955078125, "learning_rate": 1e-06, "loss": 0.3071, "step": 90 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.16275375336408615, "epoch": 0.14583333333333334, "grad_norm": 645.2577514648438, "learning_rate": 1e-06, "loss": 0.3115, "step": 91 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.16187621653079987, "epoch": 0.14743589743589744, "grad_norm": 0.025196930393576622, "learning_rate": 1e-06, "loss": 0.2797, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.333984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4512.0, "completions/mean_length": 6437.927734375, "completions/mean_terminated_length": 1450.307861328125, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "entropy": 0.18978476524353027, "epoch": 0.14903846153846154, "frac_reward_zero_std": 0.03125, "grad_norm": 520.9977416992188, "learning_rate": 1e-06, "loss": 0.1873, "num_tokens": 114084839.0, "reward": 0.3511761724948883, "reward_std": 0.209753155708313, "rewards/progression_diversity/mean": -0.11236479878425598, "rewards/progression_diversity/std": 0.17498160898685455, "rewards/symbolic_reward_accuracy/mean": 0.326171875, "rewards/symbolic_reward_accuracy/std": 0.4692695140838623, "rewards/symbolic_reward_partial_score/mean": 0.6183431148529053, "rewards/symbolic_reward_partial_score/std": 0.38944968581199646, "rewards/tag_count_reward/mean": -0.2890625, "rewards/tag_count_reward/std": 0.45377036929130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9891281127929688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 444.0, "sampling/sampling_logp_difference/mean": 12.81410026550293, "step": 93 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.16329990327358246, "epoch": 0.15064102564102563, "grad_norm": 115.51496887207031, "learning_rate": 1e-06, "loss": 0.305, "step": 94 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.17047469317913055, "epoch": 0.15224358974358973, "grad_norm": 48.81769561767578, "learning_rate": 1e-06, "loss": 0.235, "step": 95 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6328125, "entropy": 0.16202182322740555, "epoch": 0.15384615384615385, "grad_norm": 21.413990020751953, "learning_rate": 1e-06, "loss": 0.2707, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.45703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4237.0, "completions/mean_length": 8286.236328125, "completions/mean_terminated_length": 1470.1331787109375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.12126778811216354, "epoch": 0.15544871794871795, "frac_reward_zero_std": 0.0, "grad_norm": 1298.6553955078125, "learning_rate": 1e-06, "loss": 0.3518, "num_tokens": 119384688.0, "reward": 0.15234431624412537, "reward_std": 0.18723168969154358, "rewards/progression_diversity/mean": -0.1576594114303589, "rewards/progression_diversity/std": 0.18727099895477295, "rewards/symbolic_reward_accuracy/mean": 0.099609375, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.44340819120407104, "rewards/symbolic_reward_partial_score/std": 0.3761260211467743, "rewards/tag_count_reward/mean": -0.388671875, "rewards/tag_count_reward/std": 0.4879252314567566, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9738816022872925, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 446.0, "sampling/sampling_logp_difference/mean": 15.43978500366211, "step": 97 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.1406988874077797, "epoch": 0.15705128205128205, "grad_norm": 550.3086547851562, "learning_rate": 1e-06, "loss": 0.2552, "step": 98 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6328125, "entropy": 0.13929974660277367, "epoch": 0.15865384615384615, "grad_norm": 161.8490447998047, "learning_rate": 1e-06, "loss": 0.2264, "step": 99 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.4609375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.71875, "entropy": 0.11794158071279526, "epoch": 0.16025641025641027, "grad_norm": 22.501441955566406, "learning_rate": 1e-06, "loss": 0.4061, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.392578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8333.0, "completions/mean_length": 7378.84375, "completions/mean_terminated_length": 1558.791015625, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.14221202209591866, "epoch": 0.16185897435897437, "frac_reward_zero_std": 0.0, "grad_norm": 3660.10693359375, "learning_rate": 1e-06, "loss": 0.409, "num_tokens": 124050112.0, "reward": 0.18008412420749664, "reward_std": 0.16677263379096985, "rewards/progression_diversity/mean": -0.14051449298858643, "rewards/progression_diversity/std": 0.18938302993774414, "rewards/symbolic_reward_accuracy/mean": 0.115234375, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.4884277284145355, "rewards/symbolic_reward_partial_score/std": 0.37571844458580017, "rewards/tag_count_reward/mean": -0.341796875, "rewards/tag_count_reward/std": 0.4747757613658905, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.976874589920044, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 448.0, "sampling/sampling_logp_difference/mean": 15.273395538330078, "step": 101 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.1293311044573784, "epoch": 0.16346153846153846, "grad_norm": 400.09881591796875, "learning_rate": 1e-06, "loss": 0.3765, "step": 102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.1685006469488144, "epoch": 0.16506410256410256, "grad_norm": 77.7186050415039, "learning_rate": 1e-06, "loss": 0.2434, "step": 103 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6796875, "entropy": 0.16083669662475586, "epoch": 0.16666666666666666, "grad_norm": 67.05364990234375, "learning_rate": 1e-06, "loss": 0.2542, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.330078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8690.0, "completions/mean_length": 6384.462890625, "completions/mean_terminated_length": 1457.5772705078125, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "entropy": 0.17285646498203278, "epoch": 0.16826923076923078, "frac_reward_zero_std": 0.03125, "grad_norm": 1558.9949951171875, "learning_rate": 1e-06, "loss": 0.1558, "num_tokens": 128130573.0, "reward": 0.2611575722694397, "reward_std": 0.21417993307113647, "rewards/progression_diversity/mean": -0.116178959608078, "rewards/progression_diversity/std": 0.17685534060001373, "rewards/symbolic_reward_accuracy/mean": 0.197265625, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.5794758796691895, "rewards/symbolic_reward_partial_score/std": 0.3689756989479065, "rewards/tag_count_reward/mean": -0.298828125, "rewards/tag_count_reward/std": 0.45819199085235596, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9883815050125122, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 452.0, "sampling/sampling_logp_difference/mean": 13.416872024536133, "step": 105 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.1663920134305954, "epoch": 0.16987179487179488, "grad_norm": 319.74737548828125, "learning_rate": 1e-06, "loss": 0.2595, "step": 106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.18017150461673737, "epoch": 0.17147435897435898, "grad_norm": 447.3372497558594, "learning_rate": 1e-06, "loss": 0.313, "step": 107 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.182367742061615, "epoch": 0.17307692307692307, "grad_norm": 80.31652069091797, "learning_rate": 1e-06, "loss": 0.2265, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.400390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5179.0, "completions/mean_length": 7449.263671875, "completions/mean_terminated_length": 1483.0716552734375, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.1524367406964302, "epoch": 0.17467948717948717, "frac_reward_zero_std": 0.03125, "grad_norm": 3725.33349609375, "learning_rate": 1e-06, "loss": 0.3077, "num_tokens": 132834852.0, "reward": 0.24695852398872375, "reward_std": 0.17919716238975525, "rewards/progression_diversity/mean": -0.14106225967407227, "rewards/progression_diversity/std": 0.19025002419948578, "rewards/symbolic_reward_accuracy/mean": 0.20703125, "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, "rewards/symbolic_reward_partial_score/mean": 0.5316731333732605, "rewards/symbolic_reward_partial_score/std": 0.3761518597602844, "rewards/tag_count_reward/mean": -0.353515625, "rewards/tag_count_reward/std": 0.47852855920791626, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9842573404312134, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 456.0, "sampling/sampling_logp_difference/mean": 14.218891143798828, "step": 109 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.14695103839039803, "epoch": 0.1762820512820513, "grad_norm": 219.70684814453125, "learning_rate": 1e-06, "loss": 0.2567, "step": 110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.16737890988588333, "epoch": 0.1778846153846154, "grad_norm": 367.99285888671875, "learning_rate": 1e-06, "loss": 0.2145, "step": 111 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.1623668521642685, "epoch": 0.1794871794871795, "grad_norm": 359.1845703125, "learning_rate": 1e-06, "loss": 0.3114, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.298828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5083.0, "completions/mean_length": 5895.5625, "completions/mean_terminated_length": 1425.559814453125, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "entropy": 0.17023801803588867, "epoch": 0.18108974358974358, "frac_reward_zero_std": 0.03125, "grad_norm": 908.6353149414062, "learning_rate": 1e-06, "loss": 0.2684, "num_tokens": 136701844.0, "reward": 0.3186452388763428, "reward_std": 0.20103424787521362, "rewards/progression_diversity/mean": -0.1164332926273346, "rewards/progression_diversity/std": 0.18981561064720154, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.5979329347610474, "rewards/symbolic_reward_partial_score/std": 0.38331446051597595, "rewards/tag_count_reward/mean": -0.259765625, "rewards/tag_count_reward/std": 0.4389347732067108, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9887030124664307, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 456.0, "sampling/sampling_logp_difference/mean": 13.705371856689453, "step": 113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.19520440697669983, "epoch": 0.18269230769230768, "grad_norm": 1410.1068115234375, "learning_rate": 1e-06, "loss": 0.2648, "step": 114 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.20632527768611908, "epoch": 0.1842948717948718, "grad_norm": 189.16241455078125, "learning_rate": 1e-06, "loss": 0.2618, "step": 115 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.18539946526288986, "epoch": 0.1858974358974359, "grad_norm": 1.3296606540679932, "learning_rate": 1e-06, "loss": 0.2421, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13409.0, "completions/mean_length": 6352.482421875, "completions/mean_terminated_length": 1453.369140625, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 0.17278361320495605, "epoch": 0.1875, "frac_reward_zero_std": 0.03125, "grad_norm": 269.99188232421875, "learning_rate": 1e-06, "loss": 0.2609, "num_tokens": 140781531.0, "reward": 0.28726375102996826, "reward_std": 0.20572300255298615, "rewards/progression_diversity/mean": -0.1266537606716156, "rewards/progression_diversity/std": 0.19276343286037445, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.5861165523529053, "rewards/symbolic_reward_partial_score/std": 0.37256380915641785, "rewards/tag_count_reward/mean": -0.291015625, "rewards/tag_count_reward/std": 0.45467492938041687, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9863637089729309, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 460.0, "sampling/sampling_logp_difference/mean": 14.210054397583008, "step": 117 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.1698644682765007, "epoch": 0.1891025641025641, "grad_norm": 174.22406005859375, "learning_rate": 1e-06, "loss": 0.2695, "step": 118 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.17513630539178848, "epoch": 0.1907051282051282, "grad_norm": 80.73851013183594, "learning_rate": 1e-06, "loss": 0.2233, "step": 119 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.1907574087381363, "epoch": 0.19230769230769232, "grad_norm": 10.501818656921387, "learning_rate": 1e-06, "loss": 0.2534, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12543.0, "completions/mean_length": 5693.0859375, "completions/mean_terminated_length": 1428.404296875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.20927876979112625, "epoch": 0.19391025641025642, "frac_reward_zero_std": 0.0, "grad_norm": 1835.8458251953125, "learning_rate": 1e-06, "loss": 0.2083, "num_tokens": 144458791.0, "reward": 0.3568437099456787, "reward_std": 0.23056712746620178, "rewards/progression_diversity/mean": -0.11690312623977661, "rewards/progression_diversity/std": 0.19476282596588135, "rewards/symbolic_reward_accuracy/mean": 0.326171875, "rewards/symbolic_reward_accuracy/std": 0.4692695140838623, "rewards/symbolic_reward_partial_score/mean": 0.6289225220680237, "rewards/symbolic_reward_partial_score/std": 0.3799564838409424, "rewards/tag_count_reward/mean": -0.263671875, "rewards/tag_count_reward/std": 0.4410543739795685, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9867503643035889, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 462.0, "sampling/sampling_logp_difference/mean": 14.30959415435791, "step": 121 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.19147824496030807, "epoch": 0.1955128205128205, "grad_norm": 149.43600463867188, "learning_rate": 1e-06, "loss": 0.2888, "step": 122 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.19553864002227783, "epoch": 0.1971153846153846, "grad_norm": 161.30584716796875, "learning_rate": 1e-06, "loss": 0.2042, "step": 123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.1698673516511917, "epoch": 0.1987179487179487, "grad_norm": 62.17693328857422, "learning_rate": 1e-06, "loss": 0.3143, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.349609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5310.0, "completions/mean_length": 6637.630859375, "completions/mean_terminated_length": 1398.5916748046875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.15772393345832825, "epoch": 0.20032051282051283, "frac_reward_zero_std": 0.0, "grad_norm": 361.6252136230469, "learning_rate": 1e-06, "loss": 0.2752, "num_tokens": 148787658.0, "reward": 0.2472916841506958, "reward_std": 0.20401528477668762, "rewards/progression_diversity/mean": -0.13557906448841095, "rewards/progression_diversity/std": 0.1981465220451355, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.5566895008087158, "rewards/symbolic_reward_partial_score/std": 0.3679920732975006, "rewards/tag_count_reward/mean": -0.30859375, "rewards/tag_count_reward/std": 0.4623647928237915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9800022840499878, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 462.0, "sampling/sampling_logp_difference/mean": 15.337078094482422, "step": 125 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.17300068587064743, "epoch": 0.20192307692307693, "grad_norm": 195.64173889160156, "learning_rate": 1e-06, "loss": 0.2278, "step": 126 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.14945074170827866, "epoch": 0.20352564102564102, "grad_norm": 388.9164733886719, "learning_rate": 1e-06, "loss": 0.3515, "step": 127 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.3515625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.640625, "entropy": 0.1672022044658661, "epoch": 0.20512820512820512, "grad_norm": 10.722322463989258, "learning_rate": 1e-06, "loss": 0.2494, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.341796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4674.0, "completions/mean_length": 6536.28125, "completions/mean_terminated_length": 1422.480712890625, "completions/min_length": 355.0, "completions/min_terminated_length": 355.0, "entropy": 0.16759486496448517, "epoch": 0.20673076923076922, "frac_reward_zero_std": 0.03125, "grad_norm": 282.8157043457031, "learning_rate": 1e-06, "loss": 0.2464, "num_tokens": 153045402.0, "reward": 0.29630646109580994, "reward_std": 0.23296579718589783, "rewards/progression_diversity/mean": -0.1359575092792511, "rewards/progression_diversity/std": 0.20307843387126923, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.555371105670929, "rewards/symbolic_reward_partial_score/std": 0.40097343921661377, "rewards/tag_count_reward/mean": -0.283203125, "rewards/tag_count_reward/std": 0.4509948492050171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9835497140884399, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 464.0, "sampling/sampling_logp_difference/mean": 14.896586418151855, "step": 129 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6328125, "entropy": 0.1613958775997162, "epoch": 0.20833333333333334, "grad_norm": 273.4793701171875, "learning_rate": 1e-06, "loss": 0.2878, "step": 130 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.17489013820886612, "epoch": 0.20993589743589744, "grad_norm": 54.437557220458984, "learning_rate": 1e-06, "loss": 0.2721, "step": 131 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.16963838040828705, "epoch": 0.21153846153846154, "grad_norm": 174.172119140625, "learning_rate": 1e-06, "loss": 0.2341, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.365234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14442.0, "completions/mean_length": 7063.861328125, "completions/mean_terminated_length": 1701.1968994140625, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.14898524433374405, "epoch": 0.21314102564102563, "frac_reward_zero_std": 0.0, "grad_norm": 1326.01611328125, "learning_rate": 1e-06, "loss": 0.3208, "num_tokens": 157607635.0, "reward": 0.22771768271923065, "reward_std": 0.22894077003002167, "rewards/progression_diversity/mean": -0.14180660247802734, "rewards/progression_diversity/std": 0.19994327425956726, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.5255045294761658, "rewards/symbolic_reward_partial_score/std": 0.3672705292701721, "rewards/tag_count_reward/mean": -0.328125, "rewards/tag_count_reward/std": 0.4699897766113281, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9785855412483215, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 464.0, "sampling/sampling_logp_difference/mean": 15.50029182434082, "step": 133 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.14145556092262268, "epoch": 0.21474358974358973, "grad_norm": 567.6675415039062, "learning_rate": 1e-06, "loss": 0.3571, "step": 134 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.15257297456264496, "epoch": 0.21634615384615385, "grad_norm": 3.565154552459717, "learning_rate": 1e-06, "loss": 0.2647, "step": 135 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.16691310703754425, "epoch": 0.21794871794871795, "grad_norm": 91.63186645507812, "learning_rate": 1e-06, "loss": 0.2332, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.349609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3840.0, "completions/mean_length": 6634.4609375, "completions/mean_terminated_length": 1393.7177734375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.16496124863624573, "epoch": 0.21955128205128205, "frac_reward_zero_std": 0.03125, "grad_norm": 526.1981201171875, "learning_rate": 1e-06, "loss": 0.289, "num_tokens": 161836447.0, "reward": 0.20310071110725403, "reward_std": 0.19233137369155884, "rewards/progression_diversity/mean": -0.14451980590820312, "rewards/progression_diversity/std": 0.2100125253200531, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.5359863042831421, "rewards/symbolic_reward_partial_score/std": 0.3669736087322235, "rewards/tag_count_reward/mean": -0.3125, "rewards/tag_count_reward/std": 0.4639657139778137, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9792190790176392, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 468.0, "sampling/sampling_logp_difference/mean": 15.816644668579102, "step": 137 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.16982516646385193, "epoch": 0.22115384615384615, "grad_norm": 759.0693359375, "learning_rate": 1e-06, "loss": 0.4166, "step": 138 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6015625, "entropy": 0.16247816383838654, "epoch": 0.22275641025641027, "grad_norm": 522.9348754882812, "learning_rate": 1e-06, "loss": 0.2726, "step": 139 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.640625, "entropy": 0.1550752818584442, "epoch": 0.22435897435897437, "grad_norm": 4.066892147064209, "learning_rate": 1e-06, "loss": 0.3117, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.29296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7450.0, "completions/mean_length": 5863.76171875, "completions/mean_terminated_length": 1504.5469970703125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.1805713176727295, "epoch": 0.22596153846153846, "frac_reward_zero_std": 0.03125, "grad_norm": 949.1548461914062, "learning_rate": 1e-06, "loss": 0.2557, "num_tokens": 165682309.0, "reward": 0.20798048377037048, "reward_std": 0.1601024717092514, "rewards/progression_diversity/mean": -0.11650343239307404, "rewards/progression_diversity/std": 0.19056853652000427, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.5506673455238342, "rewards/symbolic_reward_partial_score/std": 0.35328954458236694, "rewards/tag_count_reward/mean": -0.263671875, "rewards/tag_count_reward/std": 0.4410543739795685, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906647205352783, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 468.0, "sampling/sampling_logp_difference/mean": 13.65820598602295, "step": 141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.19785359501838684, "epoch": 0.22756410256410256, "grad_norm": 413.5582580566406, "learning_rate": 1e-06, "loss": 0.3065, "step": 142 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.625, "entropy": 0.1836673691868782, "epoch": 0.22916666666666666, "grad_norm": 538.4183959960938, "learning_rate": 1e-06, "loss": 0.2451, "step": 143 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.19951891899108887, "epoch": 0.23076923076923078, "grad_norm": 140.01963806152344, "learning_rate": 1e-06, "loss": 0.2766, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13188.0, "completions/mean_length": 6076.03515625, "completions/mean_terminated_length": 1559.061767578125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.18072357773780823, "epoch": 0.23237179487179488, "frac_reward_zero_std": 0.0625, "grad_norm": 477.57244873046875, "learning_rate": 1e-06, "loss": 0.2059, "num_tokens": 169643863.0, "reward": 0.2985851764678955, "reward_std": 0.21802590787410736, "rewards/progression_diversity/mean": -0.12000055611133575, "rewards/progression_diversity/std": 0.19343887269496918, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.5884765386581421, "rewards/symbolic_reward_partial_score/std": 0.3711862862110138, "rewards/tag_count_reward/mean": -0.255859375, "rewards/tag_count_reward/std": 0.43676990270614624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889644384384155, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 468.0, "sampling/sampling_logp_difference/mean": 13.810128211975098, "step": 145 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.16814905405044556, "epoch": 0.23397435897435898, "grad_norm": 629.6782836914062, "learning_rate": 1e-06, "loss": 0.3065, "step": 146 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.17732855677604675, "epoch": 0.23557692307692307, "grad_norm": 60.79619216918945, "learning_rate": 1e-06, "loss": 0.2682, "step": 147 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.18027547001838684, "epoch": 0.23717948717948717, "grad_norm": 11.454753875732422, "learning_rate": 1e-06, "loss": 0.2156, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.291015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8313.0, "completions/mean_length": 5886.9921875, "completions/mean_terminated_length": 1578.3031005859375, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.20559810101985931, "epoch": 0.2387820512820513, "frac_reward_zero_std": 0.0, "grad_norm": 932.6332397460938, "learning_rate": 1e-06, "loss": 0.1883, "num_tokens": 173516387.0, "reward": 0.26607826352119446, "reward_std": 0.2007283866405487, "rewards/progression_diversity/mean": -0.11971336603164673, "rewards/progression_diversity/std": 0.19882185757160187, "rewards/symbolic_reward_accuracy/mean": 0.201171875, "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, "rewards/symbolic_reward_partial_score/mean": 0.5738606452941895, "rewards/symbolic_reward_partial_score/std": 0.3670341968536377, "rewards/tag_count_reward/mean": -0.255859375, "rewards/tag_count_reward/std": 0.43676990270614624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9897212386131287, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 470.0, "sampling/sampling_logp_difference/mean": 13.849344253540039, "step": 149 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.18233749270439148, "epoch": 0.2403846153846154, "grad_norm": 219.96221923828125, "learning_rate": 1e-06, "loss": 0.3033, "step": 150 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.17845363169908524, "epoch": 0.2419871794871795, "grad_norm": 243.94744873046875, "learning_rate": 1e-06, "loss": 0.2881, "step": 151 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.17516274005174637, "epoch": 0.24358974358974358, "grad_norm": 35.456787109375, "learning_rate": 1e-06, "loss": 0.2953, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.302734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5891.0, "completions/mean_length": 5964.396484375, "completions/mean_terminated_length": 1440.47900390625, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.17081477493047714, "epoch": 0.24519230769230768, "frac_reward_zero_std": 0.0625, "grad_norm": 492.8033752441406, "learning_rate": 1e-06, "loss": 0.3071, "num_tokens": 177494126.0, "reward": 0.2438780814409256, "reward_std": 0.16874653100967407, "rewards/progression_diversity/mean": -0.12586447596549988, "rewards/progression_diversity/std": 0.202118381857872, "rewards/symbolic_reward_accuracy/mean": 0.169921875, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.5612630844116211, "rewards/symbolic_reward_partial_score/std": 0.35993054509162903, "rewards/tag_count_reward/mean": -0.251953125, "rewards/tag_count_reward/std": 0.43455907702445984, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9910095930099487, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 472.0, "sampling/sampling_logp_difference/mean": 13.843090057373047, "step": 153 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.19071951508522034, "epoch": 0.2467948717948718, "grad_norm": 417.1937561035156, "learning_rate": 1e-06, "loss": 0.2772, "step": 154 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.18628785014152527, "epoch": 0.2483974358974359, "grad_norm": 27.67140007019043, "learning_rate": 1e-06, "loss": 0.2686, "step": 155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2085033506155014, "epoch": 0.25, "grad_norm": 0.0168951116502285, "learning_rate": 1e-06, "loss": 0.1791, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5164.0, "completions/mean_length": 5305.685546875, "completions/mean_terminated_length": 1496.6063232421875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.18942154943943024, "epoch": 0.2516025641025641, "frac_reward_zero_std": 0.03125, "grad_norm": 547.8964233398438, "learning_rate": 1e-06, "loss": 0.2312, "num_tokens": 181116349.0, "reward": 0.27441728115081787, "reward_std": 0.20713432133197784, "rewards/progression_diversity/mean": -0.10222004354000092, "rewards/progression_diversity/std": 0.18552517890930176, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.6082357168197632, "rewards/symbolic_reward_partial_score/std": 0.34247255325317383, "rewards/tag_count_reward/mean": -0.20703125, "rewards/tag_count_reward/std": 0.40557438135147095, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906232357025146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 472.0, "sampling/sampling_logp_difference/mean": 13.893712043762207, "step": 157 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.20169401168823242, "epoch": 0.2532051282051282, "grad_norm": 378.7434997558594, "learning_rate": 1e-06, "loss": 0.3347, "step": 158 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.20617859065532684, "epoch": 0.2548076923076923, "grad_norm": 51.51784133911133, "learning_rate": 1e-06, "loss": 0.2684, "step": 159 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.20408597588539124, "epoch": 0.2564102564102564, "grad_norm": 19.432321548461914, "learning_rate": 1e-06, "loss": 0.1783, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.306640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6080.0, "completions/mean_length": 6120.22265625, "completions/mean_terminated_length": 1581.0308837890625, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.17949867248535156, "epoch": 0.25801282051282054, "frac_reward_zero_std": 0.0, "grad_norm": 459.5679626464844, "learning_rate": 1e-06, "loss": 0.2659, "num_tokens": 185044895.0, "reward": 0.27867066860198975, "reward_std": 0.21932919323444366, "rewards/progression_diversity/mean": -0.12316776067018509, "rewards/progression_diversity/std": 0.19920648634433746, "rewards/symbolic_reward_accuracy/mean": 0.2265625, "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, "rewards/symbolic_reward_partial_score/mean": 0.5664713382720947, "rewards/symbolic_reward_partial_score/std": 0.3668426275253296, "rewards/tag_count_reward/mean": -0.259765625, "rewards/tag_count_reward/std": 0.4389347732067108, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9886577129364014, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 476.0, "sampling/sampling_logp_difference/mean": 14.280139923095703, "step": 161 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.17472190409898758, "epoch": 0.25961538461538464, "grad_norm": 1472.97216796875, "learning_rate": 1e-06, "loss": 0.2911, "step": 162 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.17166371643543243, "epoch": 0.26121794871794873, "grad_norm": 229.58348083496094, "learning_rate": 1e-06, "loss": 0.2621, "step": 163 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.20802847295999527, "epoch": 0.26282051282051283, "grad_norm": 183.23561096191406, "learning_rate": 1e-06, "loss": 0.2019, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.208984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6132.0, "completions/mean_length": 4608.12890625, "completions/mean_terminated_length": 1496.972900390625, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 0.20654169470071793, "epoch": 0.2644230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 412.5071716308594, "learning_rate": 1e-06, "loss": 0.2796, "num_tokens": 188104865.0, "reward": 0.3180442452430725, "reward_std": 0.21282601356506348, "rewards/progression_diversity/mean": -0.08571553975343704, "rewards/progression_diversity/std": 0.17538563907146454, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.6339681148529053, "rewards/symbolic_reward_partial_score/std": 0.34711363911628723, "rewards/tag_count_reward/mean": -0.189453125, "rewards/tag_count_reward/std": 0.3922513723373413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9954339265823364, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 476.0, "sampling/sampling_logp_difference/mean": 13.250750541687012, "step": 165 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2085501253604889, "epoch": 0.266025641025641, "grad_norm": 412.70355224609375, "learning_rate": 1e-06, "loss": 0.2375, "step": 166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.21886557340621948, "epoch": 0.2676282051282051, "grad_norm": 369.0177307128906, "learning_rate": 1e-06, "loss": 0.1979, "step": 167 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2201170101761818, "epoch": 0.2692307692307692, "grad_norm": 0.01998511515557766, "learning_rate": 1e-06, "loss": 0.2208, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.24609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4548.0, "completions/mean_length": 5164.029296875, "completions/mean_terminated_length": 1501.5517578125, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "entropy": 0.19382669776678085, "epoch": 0.2708333333333333, "frac_reward_zero_std": 0.0, "grad_norm": 395.1278991699219, "learning_rate": 1e-06, "loss": 0.2256, "num_tokens": 191630160.0, "reward": 0.29971635341644287, "reward_std": 0.2082447111606598, "rewards/progression_diversity/mean": -0.09819044172763824, "rewards/progression_diversity/std": 0.18440598249435425, "rewards/symbolic_reward_accuracy/mean": 0.228515625, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.614306628704071, "rewards/symbolic_reward_partial_score/std": 0.3599141538143158, "rewards/tag_count_reward/mean": -0.20703125, "rewards/tag_count_reward/std": 0.40557438135147095, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9958318471908569, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 480.0, "sampling/sampling_logp_difference/mean": 13.064840316772461, "step": 169 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.18345515429973602, "epoch": 0.2724358974358974, "grad_norm": 296.6773986816406, "learning_rate": 1e-06, "loss": 0.2652, "step": 170 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.65625, "entropy": 0.2129678875207901, "epoch": 0.27403846153846156, "grad_norm": 5.62185001373291, "learning_rate": 1e-06, "loss": 0.2223, "step": 171 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.2066056728363037, "epoch": 0.27564102564102566, "grad_norm": 7.7060699462890625, "learning_rate": 1e-06, "loss": 0.225, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.240234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4527.0, "completions/mean_length": 5134.7578125, "completions/mean_terminated_length": 1577.7994384765625, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.18826917558908463, "epoch": 0.27724358974358976, "frac_reward_zero_std": 0.03125, "grad_norm": 683.8331298828125, "learning_rate": 1e-06, "loss": 0.2989, "num_tokens": 195087860.0, "reward": 0.2965736985206604, "reward_std": 0.2112705409526825, "rewards/progression_diversity/mean": -0.09214277565479279, "rewards/progression_diversity/std": 0.17536333203315735, "rewards/symbolic_reward_accuracy/mean": 0.224609375, "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, "rewards/symbolic_reward_partial_score/mean": 0.6068847179412842, "rewards/symbolic_reward_partial_score/std": 0.3553755581378937, "rewards/tag_count_reward/mean": -0.193359375, "rewards/tag_count_reward/std": 0.39531853795051575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0022835731506348, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 480.0, "sampling/sampling_logp_difference/mean": 11.84273910522461, "step": 173 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.20123252272605896, "epoch": 0.27884615384615385, "grad_norm": 212.23048400878906, "learning_rate": 1e-06, "loss": 0.2336, "step": 174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.20349763333797455, "epoch": 0.28044871794871795, "grad_norm": 150.0238037109375, "learning_rate": 1e-06, "loss": 0.2346, "step": 175 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.22359148412942886, "epoch": 0.28205128205128205, "grad_norm": 88.31288146972656, "learning_rate": 1e-06, "loss": 0.1458, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12719.0, "completions/mean_length": 5528.5, "completions/mean_terminated_length": 1602.04248046875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "entropy": 0.19052383303642273, "epoch": 0.28365384615384615, "frac_reward_zero_std": 0.0, "grad_norm": 955.2317504882812, "learning_rate": 1e-06, "loss": 0.241, "num_tokens": 198768004.0, "reward": 0.2399653196334839, "reward_std": 0.19414269924163818, "rewards/progression_diversity/mean": -0.10552017390727997, "rewards/progression_diversity/std": 0.18494197726249695, "rewards/symbolic_reward_accuracy/mean": 0.16015625, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.5618652105331421, "rewards/symbolic_reward_partial_score/std": 0.350735604763031, "rewards/tag_count_reward/mean": -0.236328125, "rewards/tag_count_reward/std": 0.42524150013923645, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.989559531211853, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 482.0, "sampling/sampling_logp_difference/mean": 14.195697784423828, "step": 177 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.17670485377311707, "epoch": 0.28525641025641024, "grad_norm": 969.2748413085938, "learning_rate": 1e-06, "loss": 0.3582, "step": 178 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.18852558732032776, "epoch": 0.28685897435897434, "grad_norm": 0.01699661649763584, "learning_rate": 1e-06, "loss": 0.2577, "step": 179 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6015625, "entropy": 0.19296500831842422, "epoch": 0.28846153846153844, "grad_norm": 0.020286090672016144, "learning_rate": 1e-06, "loss": 0.2662, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16384.0, "completions/max_terminated_length": 5431.0, "completions/mean_length": 5255.9375, "completions/mean_terminated_length": 1546.5833740234375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.1944911777973175, "epoch": 0.2900641025641026, "frac_reward_zero_std": 0.03125, "grad_norm": 478.534423828125, "learning_rate": 1e-06, "loss": 0.2062, "num_tokens": 202400020.0, "reward": 0.23154094815254211, "reward_std": 0.18954092264175415, "rewards/progression_diversity/mean": -0.09737183153629303, "rewards/progression_diversity/std": 0.1798097938299179, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.5712727904319763, "rewards/symbolic_reward_partial_score/std": 0.3447929322719574, "rewards/tag_count_reward/mean": -0.197265625, "rewards/tag_count_reward/std": 0.3983237147331238, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9921056032180786, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 482.0, "sampling/sampling_logp_difference/mean": 13.62202262878418, "step": 181 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.20868180692195892, "epoch": 0.2916666666666667, "grad_norm": 51.3494758605957, "learning_rate": 1e-06, "loss": 0.2408, "step": 182 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.18569938838481903, "epoch": 0.2932692307692308, "grad_norm": 55.675235748291016, "learning_rate": 1e-06, "loss": 0.268, "step": 183 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.1851130947470665, "epoch": 0.2948717948717949, "grad_norm": 65.24762725830078, "learning_rate": 1e-06, "loss": 0.3019, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5164.0, "completions/mean_length": 4260.142578125, "completions/mean_terminated_length": 1462.329345703125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.2189076542854309, "epoch": 0.296474358974359, "frac_reward_zero_std": 0.09375, "grad_norm": 344.04833984375, "learning_rate": 1e-06, "loss": 0.2133, "num_tokens": 205497869.0, "reward": 0.31361886858940125, "reward_std": 0.19572681188583374, "rewards/progression_diversity/mean": -0.07756727933883667, "rewards/progression_diversity/std": 0.17236009240150452, "rewards/symbolic_reward_accuracy/mean": 0.234375, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.6332682371139526, "rewards/symbolic_reward_partial_score/std": 0.3539099097251892, "rewards/tag_count_reward/mean": -0.162109375, "rewards/tag_count_reward/std": 0.3689115643501282, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0022011995315552, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 484.0, "sampling/sampling_logp_difference/mean": 12.16126823425293, "step": 185 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.22352929413318634, "epoch": 0.2980769230769231, "grad_norm": 37.45823669433594, "learning_rate": 1e-06, "loss": 0.2166, "step": 186 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2236320823431015, "epoch": 0.29967948717948717, "grad_norm": 4.9470672607421875, "learning_rate": 1e-06, "loss": 0.1972, "step": 187 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.22875476628541946, "epoch": 0.30128205128205127, "grad_norm": 0.024561388418078423, "learning_rate": 1e-06, "loss": 0.2051, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4226.0, "completions/mean_length": 3966.052734375, "completions/mean_terminated_length": 1388.742919921875, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "entropy": 0.22446440160274506, "epoch": 0.30288461538461536, "frac_reward_zero_std": 0.0, "grad_norm": 2142.739013671875, "learning_rate": 1e-06, "loss": 0.2362, "num_tokens": 208400968.0, "reward": 0.2870832681655884, "reward_std": 0.18633730709552765, "rewards/progression_diversity/mean": -0.07292823493480682, "rewards/progression_diversity/std": 0.16945891082286835, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.6227864623069763, "rewards/symbolic_reward_partial_score/std": 0.33123111724853516, "rewards/tag_count_reward/mean": -0.138671875, "rewards/tag_count_reward/std": 0.34594178199768066, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0030953884124756, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 484.0, "sampling/sampling_logp_difference/mean": 12.393356323242188, "step": 189 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.22635683417320251, "epoch": 0.30448717948717946, "grad_norm": 370.2960205078125, "learning_rate": 1e-06, "loss": 0.2681, "step": 190 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2461889684200287, "epoch": 0.3060897435897436, "grad_norm": 0.01940600946545601, "learning_rate": 1e-06, "loss": 0.1929, "step": 191 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.24533099681138992, "epoch": 0.3076923076923077, "grad_norm": 0.031975965946912766, "learning_rate": 1e-06, "loss": 0.1993, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5819.0, "completions/mean_length": 5224.373046875, "completions/mean_terminated_length": 1426.59423828125, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.1923881694674492, "epoch": 0.3092948717948718, "frac_reward_zero_std": 0.0, "grad_norm": 366.1446838378906, "learning_rate": 1e-06, "loss": 0.3055, "num_tokens": 212054839.0, "reward": 0.2073436677455902, "reward_std": 0.1643807291984558, "rewards/progression_diversity/mean": -0.1074308454990387, "rewards/progression_diversity/std": 0.19538284838199615, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.5267577767372131, "rewards/symbolic_reward_partial_score/std": 0.3376568853855133, "rewards/tag_count_reward/mean": -0.19921875, "rewards/tag_count_reward/std": 0.39980348944664, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9920334815979004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 488.0, "sampling/sampling_logp_difference/mean": 14.228421211242676, "step": 193 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.2093525379896164, "epoch": 0.3108974358974359, "grad_norm": 143.63833618164062, "learning_rate": 1e-06, "loss": 0.22, "step": 194 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.20566636323928833, "epoch": 0.3125, "grad_norm": 507.2211608886719, "learning_rate": 1e-06, "loss": 0.2588, "step": 195 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.21372250467538834, "epoch": 0.3141025641025641, "grad_norm": 837.7913818359375, "learning_rate": 1e-06, "loss": 0.2089, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4510.0, "completions/mean_length": 4631.9921875, "completions/mean_terminated_length": 1265.8392333984375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.22118443250656128, "epoch": 0.3157051282051282, "frac_reward_zero_std": 0.03125, "grad_norm": 653.4398193359375, "learning_rate": 1e-06, "loss": 0.2032, "num_tokens": 215352051.0, "reward": 0.25279223918914795, "reward_std": 0.19195367395877838, "rewards/progression_diversity/mean": -0.10407879203557968, "rewards/progression_diversity/std": 0.20429863035678864, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.5993651747703552, "rewards/symbolic_reward_partial_score/std": 0.34102150797843933, "rewards/tag_count_reward/mean": -0.173828125, "rewards/tag_count_reward/std": 0.3793322443962097, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9943366050720215, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 488.0, "sampling/sampling_logp_difference/mean": 14.024613380432129, "step": 197 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.21031095832586288, "epoch": 0.3173076923076923, "grad_norm": 9.880915641784668, "learning_rate": 1e-06, "loss": 0.2061, "step": 198 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.21502617746591568, "epoch": 0.3189102564102564, "grad_norm": 159.4617156982422, "learning_rate": 1e-06, "loss": 0.2279, "step": 199 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2149796113371849, "epoch": 0.32051282051282054, "grad_norm": 0.021712809801101685, "learning_rate": 1e-06, "loss": 0.2856, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.30859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5767.0, "completions/mean_length": 5912.080078125, "completions/mean_terminated_length": 1238.17236328125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.19441968947649002, "epoch": 0.32211538461538464, "frac_reward_zero_std": 0.0, "grad_norm": 380.85797119140625, "learning_rate": 1e-06, "loss": 0.2383, "num_tokens": 219297724.0, "reward": 0.19366931915283203, "reward_std": 0.19040024280548096, "rewards/progression_diversity/mean": -0.148204505443573, "rewards/progression_diversity/std": 0.2301713079214096, "rewards/symbolic_reward_accuracy/mean": 0.099609375, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.528759777545929, "rewards/symbolic_reward_partial_score/std": 0.35425814986228943, "rewards/tag_count_reward/mean": -0.232421875, "rewards/tag_count_reward/std": 0.42278963327407837, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.979872465133667, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 490.0, "sampling/sampling_logp_difference/mean": 17.00942611694336, "step": 201 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6328125, "entropy": 0.15824927389621735, "epoch": 0.32371794871794873, "grad_norm": 166.4498291015625, "learning_rate": 1e-06, "loss": 0.3229, "step": 202 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.640625, "entropy": 0.20293635874986649, "epoch": 0.32532051282051283, "grad_norm": 0.022664187476038933, "learning_rate": 1e-06, "loss": 0.2402, "step": 203 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.671875, "entropy": 0.1808452159166336, "epoch": 0.3269230769230769, "grad_norm": 0.013569237664341927, "learning_rate": 1e-06, "loss": 0.3128, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.26171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4302.0, "completions/mean_length": 5205.919921875, "completions/mean_terminated_length": 1243.320068359375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "entropy": 0.2147229164838791, "epoch": 0.328525641025641, "frac_reward_zero_std": 0.0, "grad_norm": 922.9512939453125, "learning_rate": 1e-06, "loss": 0.2163, "num_tokens": 222885587.0, "reward": 0.2559148669242859, "reward_std": 0.22893401980400085, "rewards/progression_diversity/mean": -0.12775209546089172, "rewards/progression_diversity/std": 0.2234182357788086, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.5825684070587158, "rewards/symbolic_reward_partial_score/std": 0.36761772632598877, "rewards/tag_count_reward/mean": -0.23046875, "rewards/tag_count_reward/std": 0.42154473066329956, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9854786396026611, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 492.0, "sampling/sampling_logp_difference/mean": 16.239261627197266, "step": 205 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.2067190259695053, "epoch": 0.3301282051282051, "grad_norm": 220.25572204589844, "learning_rate": 1e-06, "loss": 0.2657, "step": 206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.22213280200958252, "epoch": 0.3317307692307692, "grad_norm": 127.26791381835938, "learning_rate": 1e-06, "loss": 0.219, "step": 207 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.20553400367498398, "epoch": 0.3333333333333333, "grad_norm": 14.982529640197754, "learning_rate": 1e-06, "loss": 0.2511, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.283203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4364.0, "completions/mean_length": 5600.46484375, "completions/mean_terminated_length": 1339.9400634765625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "entropy": 0.18529720604419708, "epoch": 0.3349358974358974, "frac_reward_zero_std": 0.0, "grad_norm": 476.10418701171875, "learning_rate": 1e-06, "loss": 0.325, "num_tokens": 226612593.0, "reward": 0.20574292540550232, "reward_std": 0.2026432454586029, "rewards/progression_diversity/mean": -0.12785804271697998, "rewards/progression_diversity/std": 0.21478815376758575, "rewards/symbolic_reward_accuracy/mean": 0.11328125, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.5507487058639526, "rewards/symbolic_reward_partial_score/std": 0.3405325710773468, "rewards/tag_count_reward/mean": -0.26171875, "rewards/tag_count_reward/std": 0.44000017642974854, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.983894407749176, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 492.0, "sampling/sampling_logp_difference/mean": 16.456192016601562, "step": 209 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.19580994546413422, "epoch": 0.33653846153846156, "grad_norm": 216.3179931640625, "learning_rate": 1e-06, "loss": 0.2533, "step": 210 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.19553975760936737, "epoch": 0.33814102564102566, "grad_norm": 1221.7452392578125, "learning_rate": 1e-06, "loss": 0.3216, "step": 211 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.20991089940071106, "epoch": 0.33974358974358976, "grad_norm": 300.2898254394531, "learning_rate": 1e-06, "loss": 0.2328, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5346.0, "completions/mean_length": 5424.576171875, "completions/mean_terminated_length": 1460.5291748046875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.1831618994474411, "epoch": 0.34134615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 961.52490234375, "learning_rate": 1e-06, "loss": 0.3222, "num_tokens": 230279560.0, "reward": 0.22739171981811523, "reward_std": 0.2230047881603241, "rewards/progression_diversity/mean": -0.12215697765350342, "rewards/progression_diversity/std": 0.21225501596927643, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.5380859375, "rewards/symbolic_reward_partial_score/std": 0.3611067831516266, "rewards/tag_count_reward/mean": -0.2421875, "rewards/tag_count_reward/std": 0.42882615327835083, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.986311137676239, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 494.0, "sampling/sampling_logp_difference/mean": 16.12148666381836, "step": 213 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.22351820021867752, "epoch": 0.34294871794871795, "grad_norm": 112.74642181396484, "learning_rate": 1e-06, "loss": 0.2065, "step": 214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.20397323369979858, "epoch": 0.34455128205128205, "grad_norm": 106.15531921386719, "learning_rate": 1e-06, "loss": 0.2369, "step": 215 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6328125, "entropy": 0.17963356524705887, "epoch": 0.34615384615384615, "grad_norm": 2.7653439044952393, "learning_rate": 1e-06, "loss": 0.2618, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 5181.009765625, "completions/mean_terminated_length": 1407.6788330078125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.21698909252882004, "epoch": 0.34775641025641024, "frac_reward_zero_std": 0.0, "grad_norm": 2655.067138671875, "learning_rate": 1e-06, "loss": 0.1933, "num_tokens": 233824525.0, "reward": 0.2999926805496216, "reward_std": 0.2547125220298767, "rewards/progression_diversity/mean": -0.12036378681659698, "rewards/progression_diversity/std": 0.2145136296749115, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.578857421875, "rewards/symbolic_reward_partial_score/std": 0.3651575446128845, "rewards/tag_count_reward/mean": -0.212890625, "rewards/tag_count_reward/std": 0.409751296043396, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9898908734321594, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 496.0, "sampling/sampling_logp_difference/mean": 15.950399398803711, "step": 217 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.19810165464878082, "epoch": 0.34935897435897434, "grad_norm": 90.42063903808594, "learning_rate": 1e-06, "loss": 0.2934, "step": 218 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.21933219581842422, "epoch": 0.35096153846153844, "grad_norm": 158.33421325683594, "learning_rate": 1e-06, "loss": 0.1813, "step": 219 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.22585007548332214, "epoch": 0.3525641025641026, "grad_norm": 114.27921295166016, "learning_rate": 1e-06, "loss": 0.2229, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.259765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4320.0, "completions/mean_length": 5331.865234375, "completions/mean_terminated_length": 1453.41162109375, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.20159848034381866, "epoch": 0.3541666666666667, "frac_reward_zero_std": 0.0625, "grad_norm": 953.3041381835938, "learning_rate": 1e-06, "loss": 0.2609, "num_tokens": 237447016.0, "reward": 0.2347480058670044, "reward_std": 0.1780080795288086, "rewards/progression_diversity/mean": -0.12334457039833069, "rewards/progression_diversity/std": 0.21702897548675537, "rewards/symbolic_reward_accuracy/mean": 0.146484375, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.5691568851470947, "rewards/symbolic_reward_partial_score/std": 0.3390918970108032, "rewards/tag_count_reward/mean": -0.2265625, "rewards/tag_count_reward/std": 0.4190165400505066, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9891093969345093, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 496.0, "sampling/sampling_logp_difference/mean": 16.061904907226562, "step": 221 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.20780060440301895, "epoch": 0.3557692307692308, "grad_norm": 171.24063110351562, "learning_rate": 1e-06, "loss": 0.2021, "step": 222 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.19532399624586105, "epoch": 0.3573717948717949, "grad_norm": 127.67808532714844, "learning_rate": 1e-06, "loss": 0.2701, "step": 223 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2121635228395462, "epoch": 0.358974358974359, "grad_norm": 36.59706497192383, "learning_rate": 1e-06, "loss": 0.19, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3714.0, "completions/mean_length": 4855.25390625, "completions/mean_terminated_length": 1326.0458984375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.17904973775148392, "epoch": 0.3605769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 378.9632568359375, "learning_rate": 1e-06, "loss": 0.3551, "num_tokens": 240758634.0, "reward": 0.26461654901504517, "reward_std": 0.2048269361257553, "rewards/progression_diversity/mean": -0.11061383038759232, "rewards/progression_diversity/std": 0.20703071355819702, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.5862630605697632, "rewards/symbolic_reward_partial_score/std": 0.34296321868896484, "rewards/tag_count_reward/mean": -0.203125, "rewards/tag_count_reward/std": 0.4027182459831238, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9891663789749146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 496.0, "sampling/sampling_logp_difference/mean": 16.460721969604492, "step": 225 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.20133854448795319, "epoch": 0.36217948717948717, "grad_norm": 1110.4901123046875, "learning_rate": 1e-06, "loss": 0.2465, "step": 226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.22215987741947174, "epoch": 0.36378205128205127, "grad_norm": 26.63279151916504, "learning_rate": 1e-06, "loss": 0.2531, "step": 227 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.23379182070493698, "epoch": 0.36538461538461536, "grad_norm": 0.5691369771957397, "learning_rate": 1e-06, "loss": 0.1706, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 4385.185546875, "completions/mean_terminated_length": 1326.664306640625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "entropy": 0.23052329570055008, "epoch": 0.36698717948717946, "frac_reward_zero_std": 0.0, "grad_norm": 360.1962585449219, "learning_rate": 1e-06, "loss": 0.2534, "num_tokens": 243786505.0, "reward": 0.3250322937965393, "reward_std": 0.22688430547714233, "rewards/progression_diversity/mean": -0.09101086109876633, "rewards/progression_diversity/std": 0.18869265913963318, "rewards/symbolic_reward_accuracy/mean": 0.25390625, "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, "rewards/symbolic_reward_partial_score/mean": 0.6346517205238342, "rewards/symbolic_reward_partial_score/std": 0.33860599994659424, "rewards/tag_count_reward/mean": -0.16796875, "rewards/tag_count_reward/std": 0.374204158782959, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9971761703491211, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 498.0, "sampling/sampling_logp_difference/mean": 14.882648468017578, "step": 229 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2123500406742096, "epoch": 0.3685897435897436, "grad_norm": 343.66748046875, "learning_rate": 1e-06, "loss": 0.2335, "step": 230 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.24668648838996887, "epoch": 0.3701923076923077, "grad_norm": 318.5328674316406, "learning_rate": 1e-06, "loss": 0.1648, "step": 231 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.266249381005764, "epoch": 0.3717948717948718, "grad_norm": 0.021640097722411156, "learning_rate": 1e-06, "loss": 0.163, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.232421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4106.0, "completions/mean_length": 4834.337890625, "completions/mean_terminated_length": 1337.1119384765625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "entropy": 0.22328076511621475, "epoch": 0.3733974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 1194.5416259765625, "learning_rate": 1e-06, "loss": 0.2147, "num_tokens": 247135110.0, "reward": 0.28227871656417847, "reward_std": 0.21940109133720398, "rewards/progression_diversity/mean": -0.10660064220428467, "rewards/progression_diversity/std": 0.20069687068462372, "rewards/symbolic_reward_accuracy/mean": 0.201171875, "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, "rewards/symbolic_reward_partial_score/mean": 0.6026855707168579, "rewards/symbolic_reward_partial_score/std": 0.3378095328807831, "rewards/tag_count_reward/mean": -0.181640625, "rewards/tag_count_reward/std": 0.38592514395713806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9878015518188477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 500.0, "sampling/sampling_logp_difference/mean": 16.46212387084961, "step": 233 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2334306240081787, "epoch": 0.375, "grad_norm": 738.572509765625, "learning_rate": 1e-06, "loss": 0.2937, "step": 234 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.21482889354228973, "epoch": 0.3766025641025641, "grad_norm": 477.08978271484375, "learning_rate": 1e-06, "loss": 0.274, "step": 235 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.21712397038936615, "epoch": 0.3782051282051282, "grad_norm": 372.0608215332031, "learning_rate": 1e-06, "loss": 0.33, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3953.0, "completions/mean_length": 5198.51953125, "completions/mean_terminated_length": 1391.9423828125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "entropy": 0.1952293962240219, "epoch": 0.3798076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 534.4794311523438, "learning_rate": 1e-06, "loss": 0.223, "num_tokens": 250668384.0, "reward": 0.20283983647823334, "reward_std": 0.18369260430335999, "rewards/progression_diversity/mean": -0.1090836450457573, "rewards/progression_diversity/std": 0.19674594700336456, "rewards/symbolic_reward_accuracy/mean": 0.09765625, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.553466796875, "rewards/symbolic_reward_partial_score/std": 0.3304186761379242, "rewards/tag_count_reward/mean": -0.20703125, "rewards/tag_count_reward/std": 0.40557438135147095, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9892090559005737, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 500.0, "sampling/sampling_logp_difference/mean": 16.296913146972656, "step": 237 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.2330704852938652, "epoch": 0.3814102564102564, "grad_norm": 31.066238403320312, "learning_rate": 1e-06, "loss": 0.2238, "step": 238 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.21373382210731506, "epoch": 0.38301282051282054, "grad_norm": 120.66889953613281, "learning_rate": 1e-06, "loss": 0.2189, "step": 239 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.19722618907690048, "epoch": 0.38461538461538464, "grad_norm": 12.67758560180664, "learning_rate": 1e-06, "loss": 0.2972, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4564.0, "completions/mean_length": 4309.216796875, "completions/mean_terminated_length": 1305.246337890625, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "entropy": 0.23315541446208954, "epoch": 0.38621794871794873, "frac_reward_zero_std": 0.03125, "grad_norm": 488.22998046875, "learning_rate": 1e-06, "loss": 0.1621, "num_tokens": 253703711.0, "reward": 0.26993727684020996, "reward_std": 0.21065741777420044, "rewards/progression_diversity/mean": -0.09562888741493225, "rewards/progression_diversity/std": 0.19961923360824585, "rewards/symbolic_reward_accuracy/mean": 0.171875, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.6106607913970947, "rewards/symbolic_reward_partial_score/std": 0.32578223943710327, "rewards/tag_count_reward/mean": -0.154296875, "rewards/tag_count_reward/std": 0.36158639192581177, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.994120717048645, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 504.0, "sampling/sampling_logp_difference/mean": 15.699442863464355, "step": 241 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.22725315392017365, "epoch": 0.38782051282051283, "grad_norm": 728.95068359375, "learning_rate": 1e-06, "loss": 0.2635, "step": 242 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.24956174194812775, "epoch": 0.3894230769230769, "grad_norm": 216.6647491455078, "learning_rate": 1e-06, "loss": 0.162, "step": 243 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.22464978694915771, "epoch": 0.391025641025641, "grad_norm": 351.8695373535156, "learning_rate": 1e-06, "loss": 0.241, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.236328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4847.0, "completions/mean_length": 4997.755859375, "completions/mean_terminated_length": 1474.135498046875, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.20301750302314758, "epoch": 0.3926282051282051, "frac_reward_zero_std": 0.0, "grad_norm": 481.3808898925781, "learning_rate": 1e-06, "loss": 0.2076, "num_tokens": 257209026.0, "reward": 0.2624889016151428, "reward_std": 0.19924888014793396, "rewards/progression_diversity/mean": -0.10560305416584015, "rewards/progression_diversity/std": 0.1964842677116394, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.5640299320220947, "rewards/symbolic_reward_partial_score/std": 0.3424655795097351, "rewards/tag_count_reward/mean": -0.181640625, "rewards/tag_count_reward/std": 0.38592514395713806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9906989336013794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 504.0, "sampling/sampling_logp_difference/mean": 16.07154083251953, "step": 245 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2440878227353096, "epoch": 0.3942307692307692, "grad_norm": 111.19556427001953, "learning_rate": 1e-06, "loss": 0.1726, "step": 246 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.22275415807962418, "epoch": 0.3958333333333333, "grad_norm": 237.71017456054688, "learning_rate": 1e-06, "loss": 0.2368, "step": 247 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.3984375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.2082279548048973, "epoch": 0.3974358974358974, "grad_norm": 15.373961448669434, "learning_rate": 1e-06, "loss": 0.3212, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4378.0, "completions/mean_length": 5073.48828125, "completions/mean_terminated_length": 1458.78857421875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "entropy": 0.2016061171889305, "epoch": 0.39903846153846156, "frac_reward_zero_std": 0.0, "grad_norm": 1202.8836669921875, "learning_rate": 1e-06, "loss": 0.2794, "num_tokens": 260669868.0, "reward": 0.18259669840335846, "reward_std": 0.16573688387870789, "rewards/progression_diversity/mean": -0.1065426617860794, "rewards/progression_diversity/std": 0.19588853418827057, "rewards/symbolic_reward_accuracy/mean": 0.068359375, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.5438476204872131, "rewards/symbolic_reward_partial_score/std": 0.32987871766090393, "rewards/tag_count_reward/mean": -0.205078125, "rewards/tag_count_reward/std": 0.4041535556316376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9889222383499146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 504.0, "sampling/sampling_logp_difference/mean": 16.378686904907227, "step": 249 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2196366786956787, "epoch": 0.40064102564102566, "grad_norm": 56.70030975341797, "learning_rate": 1e-06, "loss": 0.2487, "step": 250 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.2119002267718315, "epoch": 0.40224358974358976, "grad_norm": 3.978065013885498, "learning_rate": 1e-06, "loss": 0.2374, "step": 251 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.21355099976062775, "epoch": 0.40384615384615385, "grad_norm": 0.02542886696755886, "learning_rate": 1e-06, "loss": 0.2578, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 4913.615234375, "completions/mean_terminated_length": 1402.27294921875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.22765664756298065, "epoch": 0.40544871794871795, "frac_reward_zero_std": 0.0, "grad_norm": 457.21380615234375, "learning_rate": 1e-06, "loss": 0.1611, "num_tokens": 264010663.0, "reward": 0.2566929757595062, "reward_std": 0.22699853777885437, "rewards/progression_diversity/mean": -0.10218832641839981, "rewards/progression_diversity/std": 0.19470685720443726, "rewards/symbolic_reward_accuracy/mean": 0.177734375, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.5693359375, "rewards/symbolic_reward_partial_score/std": 0.34047579765319824, "rewards/tag_count_reward/mean": -0.197265625, "rewards/tag_count_reward/std": 0.3983237147331238, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911985397338867, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 16.304428100585938, "step": 253 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.21794138103723526, "epoch": 0.40705128205128205, "grad_norm": 297.2830505371094, "learning_rate": 1e-06, "loss": 0.3116, "step": 254 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.3515625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2055223435163498, "epoch": 0.40865384615384615, "grad_norm": 0.2487526535987854, "learning_rate": 1e-06, "loss": 0.2665, "step": 255 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.21468888968229294, "epoch": 0.41025641025641024, "grad_norm": 0.02222493290901184, "learning_rate": 1e-06, "loss": 0.2176, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.244140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3774.0, "completions/mean_length": 5068.212890625, "completions/mean_terminated_length": 1413.242919921875, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.21585044264793396, "epoch": 0.41185897435897434, "frac_reward_zero_std": 0.0, "grad_norm": 821.6210327148438, "learning_rate": 1e-06, "loss": 0.2695, "num_tokens": 267516868.0, "reward": 0.2098761796951294, "reward_std": 0.16895067691802979, "rewards/progression_diversity/mean": -0.11101497709751129, "rewards/progression_diversity/std": 0.20189963281154633, "rewards/symbolic_reward_accuracy/mean": 0.11328125, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.5431314706802368, "rewards/symbolic_reward_partial_score/std": 0.334158331155777, "rewards/tag_count_reward/mean": -0.19921875, "rewards/tag_count_reward/std": 0.39980348944664, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9911006689071655, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 17.146984100341797, "step": 257 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.22612430155277252, "epoch": 0.41346153846153844, "grad_norm": 543.9043579101562, "learning_rate": 1e-06, "loss": 0.3986, "step": 258 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.244947150349617, "epoch": 0.4150641025641026, "grad_norm": 313.92132568359375, "learning_rate": 1e-06, "loss": 0.1894, "step": 259 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6484375, "entropy": 0.2207338884472847, "epoch": 0.4166666666666667, "grad_norm": 0.4175088405609131, "learning_rate": 1e-06, "loss": 0.2577, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5952.0, "completions/mean_length": 5339.615234375, "completions/mean_terminated_length": 1503.144775390625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.23514293134212494, "epoch": 0.4182692307692308, "frac_reward_zero_std": 0.03125, "grad_norm": 860.50244140625, "learning_rate": 1e-06, "loss": 0.2181, "num_tokens": 271111039.0, "reward": 0.2324339747428894, "reward_std": 0.19143269956111908, "rewards/progression_diversity/mean": -0.11207312345504761, "rewards/progression_diversity/std": 0.19719642400741577, "rewards/symbolic_reward_accuracy/mean": 0.16015625, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.524609386920929, "rewards/symbolic_reward_partial_score/std": 0.3499920070171356, "rewards/tag_count_reward/mean": -0.19921875, "rewards/tag_count_reward/std": 0.39980348944664, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9931377172470093, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 16.724380493164062, "step": 261 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.19652889668941498, "epoch": 0.4198717948717949, "grad_norm": 72.61575317382812, "learning_rate": 1e-06, "loss": 0.2661, "step": 262 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2538982629776001, "epoch": 0.421474358974359, "grad_norm": 1.3014869689941406, "learning_rate": 1e-06, "loss": 0.1651, "step": 263 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.3515625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.21324076503515244, "epoch": 0.4230769230769231, "grad_norm": 7.824087142944336, "learning_rate": 1e-06, "loss": 0.3078, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4048.0, "completions/mean_length": 4864.2734375, "completions/mean_terminated_length": 1414.2030029296875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "entropy": 0.24157585948705673, "epoch": 0.42467948717948717, "frac_reward_zero_std": 0.0, "grad_norm": 681.8773193359375, "learning_rate": 1e-06, "loss": 0.2296, "num_tokens": 274522011.0, "reward": 0.23941701650619507, "reward_std": 0.200226292014122, "rewards/progression_diversity/mean": -0.0978497862815857, "rewards/progression_diversity/std": 0.1862473040819168, "rewards/symbolic_reward_accuracy/mean": 0.1484375, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.5630371570587158, "rewards/symbolic_reward_partial_score/std": 0.34202542901039124, "rewards/tag_count_reward/mean": -0.17578125, "rewards/tag_count_reward/std": 0.3810062110424042, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9933131337165833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 16.649776458740234, "step": 265 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.25425516068935394, "epoch": 0.42628205128205127, "grad_norm": 0.02002408355474472, "learning_rate": 1e-06, "loss": 0.2305, "step": 266 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6484375, "entropy": 0.22516103833913803, "epoch": 0.42788461538461536, "grad_norm": 0.027018295601010323, "learning_rate": 1e-06, "loss": 0.2392, "step": 267 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.237248495221138, "epoch": 0.42948717948717946, "grad_norm": 2.79491925239563, "learning_rate": 1e-06, "loss": 0.2575, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 4960.13671875, "completions/mean_terminated_length": 1463.03564453125, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.24601341784000397, "epoch": 0.4310897435897436, "frac_reward_zero_std": 0.0, "grad_norm": 1408.3753662109375, "learning_rate": 1e-06, "loss": 0.2201, "num_tokens": 278050673.0, "reward": 0.20770949125289917, "reward_std": 0.19745418429374695, "rewards/progression_diversity/mean": -0.10600487887859344, "rewards/progression_diversity/std": 0.1964379996061325, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.5253255367279053, "rewards/symbolic_reward_partial_score/std": 0.3296595513820648, "rewards/tag_count_reward/mean": -0.19140625, "rewards/tag_count_reward/std": 0.3937928080558777, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9925892949104309, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 16.926132202148438, "step": 269 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.24742096662521362, "epoch": 0.4326923076923077, "grad_norm": 0.18998423218727112, "learning_rate": 1e-06, "loss": 0.1687, "step": 270 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.23993898928165436, "epoch": 0.4342948717948718, "grad_norm": 0.016234349459409714, "learning_rate": 1e-06, "loss": 0.3282, "step": 271 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.22421810775995255, "epoch": 0.4358974358974359, "grad_norm": 0.023402217775583267, "learning_rate": 1e-06, "loss": 0.2581, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5016.0, "completions/mean_length": 4715.15234375, "completions/mean_terminated_length": 1595.7574462890625, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.2802828401327133, "epoch": 0.4375, "frac_reward_zero_std": 0.0, "grad_norm": 389.8497619628906, "learning_rate": 1e-06, "loss": 0.1786, "num_tokens": 281394383.0, "reward": 0.2193732112646103, "reward_std": 0.20153236389160156, "rewards/progression_diversity/mean": -0.09344275295734406, "rewards/progression_diversity/std": 0.18624533712863922, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.5436035394668579, "rewards/symbolic_reward_partial_score/std": 0.3351518511772156, "rewards/tag_count_reward/mean": -0.177734375, "rewards/tag_count_reward/std": 0.3826628625392914, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0049525499343872, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 14.498746871948242, "step": 273 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.25365348160266876, "epoch": 0.4391025641025641, "grad_norm": 250.75274658203125, "learning_rate": 1e-06, "loss": 0.2038, "step": 274 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.2545860484242439, "epoch": 0.4407051282051282, "grad_norm": 0.01937059499323368, "learning_rate": 1e-06, "loss": 0.2308, "step": 275 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.23184729367494583, "epoch": 0.4423076923076923, "grad_norm": 0.2053852677345276, "learning_rate": 1e-06, "loss": 0.2823, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.251953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4437.0, "completions/mean_length": 5307.53125, "completions/mean_terminated_length": 1576.814697265625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.22293350100517273, "epoch": 0.4439102564102564, "frac_reward_zero_std": 0.0, "grad_norm": 594.2986450195312, "learning_rate": 1e-06, "loss": 0.3091, "num_tokens": 285031903.0, "reward": 0.19161370396614075, "reward_std": 0.18198563158512115, "rewards/progression_diversity/mean": -0.11011378467082977, "rewards/progression_diversity/std": 0.1955021768808365, "rewards/symbolic_reward_accuracy/mean": 0.091796875, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.5317057371139526, "rewards/symbolic_reward_partial_score/std": 0.3342364728450775, "rewards/tag_count_reward/mean": -0.21875, "rewards/tag_count_reward/std": 0.41380295157432556, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9914793372154236, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 17.115394592285156, "step": 277 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2029179409146309, "epoch": 0.44551282051282054, "grad_norm": 56.31192398071289, "learning_rate": 1e-06, "loss": 0.3572, "step": 278 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3828125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6328125, "entropy": 0.25365860760211945, "epoch": 0.44711538461538464, "grad_norm": 0.015985539183020592, "learning_rate": 1e-06, "loss": 0.1795, "step": 279 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.25643934309482574, "epoch": 0.44871794871794873, "grad_norm": 8.044807434082031, "learning_rate": 1e-06, "loss": 0.2118, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.236328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5886.0, "completions/mean_length": 5134.87890625, "completions/mean_terminated_length": 1653.693115234375, "completions/min_length": 457.0, "completions/min_terminated_length": 457.0, "entropy": 0.27302658557891846, "epoch": 0.45032051282051283, "frac_reward_zero_std": 0.0, "grad_norm": 5191.970703125, "learning_rate": 1e-06, "loss": 0.1912, "num_tokens": 288559489.0, "reward": 0.20244288444519043, "reward_std": 0.1676143854856491, "rewards/progression_diversity/mean": -0.10288003087043762, "rewards/progression_diversity/std": 0.19318129122257233, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.5037597417831421, "rewards/symbolic_reward_partial_score/std": 0.34105032682418823, "rewards/tag_count_reward/mean": -0.21484375, "rewards/tag_count_reward/std": 0.4111155867576599, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008915662765503, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 15.52652359008789, "step": 281 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.2505979463458061, "epoch": 0.4519230769230769, "grad_norm": 1842.8763427734375, "learning_rate": 1e-06, "loss": 0.2985, "step": 282 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3671875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.23081808537244797, "epoch": 0.453525641025641, "grad_norm": 0.18885204195976257, "learning_rate": 1e-06, "loss": 0.3087, "step": 283 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.2553938925266266, "epoch": 0.4551282051282051, "grad_norm": 0.015816714614629745, "learning_rate": 1e-06, "loss": 0.224, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4693.0, "completions/mean_length": 4956.9609375, "completions/mean_terminated_length": 1683.889404296875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.2509460896253586, "epoch": 0.4567307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 501.21697998046875, "learning_rate": 1e-06, "loss": 0.2214, "num_tokens": 291920845.0, "reward": 0.19516658782958984, "reward_std": 0.18093836307525635, "rewards/progression_diversity/mean": -0.09760037809610367, "rewards/progression_diversity/std": 0.18880048394203186, "rewards/symbolic_reward_accuracy/mean": 0.09375, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.5301106572151184, "rewards/symbolic_reward_partial_score/std": 0.3172612190246582, "rewards/tag_count_reward/mean": -0.19140625, "rewards/tag_count_reward/std": 0.3937928080558777, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9981337189674377, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 15.91734790802002, "step": 285 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.2802078127861023, "epoch": 0.4583333333333333, "grad_norm": 113.05368041992188, "learning_rate": 1e-06, "loss": 0.2016, "step": 286 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.65625, "entropy": 0.24139665812253952, "epoch": 0.4599358974358974, "grad_norm": 0.025173721835017204, "learning_rate": 1e-06, "loss": 0.292, "step": 287 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.27211469411849976, "epoch": 0.46153846153846156, "grad_norm": 3.040055513381958, "learning_rate": 1e-06, "loss": 0.2398, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.220703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5427.0, "completions/mean_length": 4934.23046875, "completions/mean_terminated_length": 1691.5638427734375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.2553727924823761, "epoch": 0.46314102564102566, "frac_reward_zero_std": 0.0, "grad_norm": 1051.9158935546875, "learning_rate": 1e-06, "loss": 0.2454, "num_tokens": 295288899.0, "reward": 0.2229723334312439, "reward_std": 0.21836064755916595, "rewards/progression_diversity/mean": -0.09778685122728348, "rewards/progression_diversity/std": 0.1902952492237091, "rewards/symbolic_reward_accuracy/mean": 0.13671875, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.540771484375, "rewards/symbolic_reward_partial_score/std": 0.33182287216186523, "rewards/tag_count_reward/mean": -0.203125, "rewards/tag_count_reward/std": 0.4027182459831238, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.004034161567688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 14.605140686035156, "step": 289 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.24111786484718323, "epoch": 0.46474358974358976, "grad_norm": 3.2952563762664795, "learning_rate": 1e-06, "loss": 0.2509, "step": 290 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.2719377726316452, "epoch": 0.46634615384615385, "grad_norm": 0.019442997872829437, "learning_rate": 1e-06, "loss": 0.1819, "step": 291 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.65625, "entropy": 0.2737286537885666, "epoch": 0.46794871794871795, "grad_norm": 0.02424200251698494, "learning_rate": 1e-06, "loss": 0.2014, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4168.0, "completions/mean_length": 4012.345703125, "completions/mean_terminated_length": 1653.0999755859375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "entropy": 0.2804551422595978, "epoch": 0.46955128205128205, "frac_reward_zero_std": 0.0, "grad_norm": 807.3659057617188, "learning_rate": 1e-06, "loss": 0.1918, "num_tokens": 298231508.0, "reward": 0.1925782412290573, "reward_std": 0.16427671909332275, "rewards/progression_diversity/mean": -0.07176616787910461, "rewards/progression_diversity/std": 0.16853131353855133, "rewards/symbolic_reward_accuracy/mean": 0.08203125, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.5264810919761658, "rewards/symbolic_reward_partial_score/std": 0.3188028335571289, "rewards/tag_count_reward/mean": -0.138671875, "rewards/tag_count_reward/std": 0.34594178199768066, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0244982242584229, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 504.0, "sampling/sampling_logp_difference/mean": 11.09280776977539, "step": 293 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.30911885201931, "epoch": 0.47115384615384615, "grad_norm": 343.7190246582031, "learning_rate": 1e-06, "loss": 0.1829, "step": 294 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.2846119552850723, "epoch": 0.47275641025641024, "grad_norm": 650.09619140625, "learning_rate": 1e-06, "loss": 0.2431, "step": 295 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3162567764520645, "epoch": 0.47435897435897434, "grad_norm": 0.03714370355010033, "learning_rate": 1e-06, "loss": 0.1759, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.228515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4304.0, "completions/mean_length": 5002.0859375, "completions/mean_terminated_length": 1630.734130859375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.2860996425151825, "epoch": 0.47596153846153844, "frac_reward_zero_std": 0.0, "grad_norm": 530.2379150390625, "learning_rate": 1e-06, "loss": 0.2122, "num_tokens": 301642864.0, "reward": 0.253789484500885, "reward_std": 0.20528803765773773, "rewards/progression_diversity/mean": -0.10005459934473038, "rewards/progression_diversity/std": 0.1889607459306717, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.5315917730331421, "rewards/symbolic_reward_partial_score/std": 0.363020122051239, "rewards/tag_count_reward/mean": -0.18359375, "rewards/tag_count_reward/std": 0.3875311613082886, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0061748027801514, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 14.482137680053711, "step": 297 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.27232350409030914, "epoch": 0.4775641025641026, "grad_norm": 98.60669708251953, "learning_rate": 1e-06, "loss": 0.2171, "step": 298 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.40625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.2478073537349701, "epoch": 0.4791666666666667, "grad_norm": 10.104867935180664, "learning_rate": 1e-06, "loss": 0.2962, "step": 299 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.28513647615909576, "epoch": 0.4807692307692308, "grad_norm": 0.036038998514413834, "learning_rate": 1e-06, "loss": 0.2311, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4377.0, "completions/mean_length": 4445.552734375, "completions/mean_terminated_length": 1619.54345703125, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.3068581074476242, "epoch": 0.4823717948717949, "frac_reward_zero_std": 0.0, "grad_norm": 330.840087890625, "learning_rate": 1e-06, "loss": 0.1263, "num_tokens": 304794523.0, "reward": 0.2764816880226135, "reward_std": 0.20622991025447845, "rewards/progression_diversity/mean": -0.08181288093328476, "rewards/progression_diversity/std": 0.17579428851604462, "rewards/symbolic_reward_accuracy/mean": 0.205078125, "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, "rewards/symbolic_reward_partial_score/mean": 0.5688639283180237, "rewards/symbolic_reward_partial_score/std": 0.3486393988132477, "rewards/tag_count_reward/mean": -0.1640625, "rewards/tag_count_reward/std": 0.37069445848464966, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0138086080551147, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 508.0, "sampling/sampling_logp_difference/mean": 13.007370948791504, "step": 301 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2967989444732666, "epoch": 0.483974358974359, "grad_norm": 484.8408203125, "learning_rate": 1e-06, "loss": 0.2192, "step": 302 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.26128391921520233, "epoch": 0.4855769230769231, "grad_norm": 281.1661376953125, "learning_rate": 1e-06, "loss": 0.2839, "step": 303 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.2628230154514313, "epoch": 0.48717948717948717, "grad_norm": 0.027397677302360535, "learning_rate": 1e-06, "loss": 0.2977, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4609.0, "completions/mean_length": 4459.234375, "completions/mean_terminated_length": 1636.4637451171875, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "entropy": 0.28316499292850494, "epoch": 0.48878205128205127, "frac_reward_zero_std": 0.03125, "grad_norm": 847.23779296875, "learning_rate": 1e-06, "loss": 0.2316, "num_tokens": 307925859.0, "reward": 0.25070130825042725, "reward_std": 0.16120730340480804, "rewards/progression_diversity/mean": -0.08758464455604553, "rewards/progression_diversity/std": 0.18611328303813934, "rewards/symbolic_reward_accuracy/mean": 0.1640625, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.5677571296691895, "rewards/symbolic_reward_partial_score/std": 0.333774209022522, "rewards/tag_count_reward/mean": -0.171875, "rewards/tag_count_reward/std": 0.3776407241821289, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0140712261199951, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 512.0, "sampling/sampling_logp_difference/mean": 13.223786354064941, "step": 305 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.375, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.2871243506669998, "epoch": 0.49038461538461536, "grad_norm": 393.9886474609375, "learning_rate": 1e-06, "loss": 0.2205, "step": 306 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3013138175010681, "epoch": 0.49198717948717946, "grad_norm": 0.020085537806153297, "learning_rate": 1e-06, "loss": 0.1944, "step": 307 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.2749166488647461, "epoch": 0.4935897435897436, "grad_norm": 0.01898149587213993, "learning_rate": 1e-06, "loss": 0.2895, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5109.0, "completions/mean_length": 4079.05859375, "completions/mean_terminated_length": 1594.9625244140625, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.2986243665218353, "epoch": 0.4951923076923077, "frac_reward_zero_std": 0.0625, "grad_norm": 579.7413330078125, "learning_rate": 1e-06, "loss": 0.2209, "num_tokens": 310795841.0, "reward": 0.24147483706474304, "reward_std": 0.18043410778045654, "rewards/progression_diversity/mean": -0.0688263550400734, "rewards/progression_diversity/std": 0.15821245312690735, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.589111328125, "rewards/symbolic_reward_partial_score/std": 0.3147651255130768, "rewards/tag_count_reward/mean": -0.142578125, "rewards/tag_count_reward/std": 0.3499840497970581, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0225772857666016, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 512.0, "sampling/sampling_logp_difference/mean": 11.870687484741211, "step": 309 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3223755210638046, "epoch": 0.4967948717948718, "grad_norm": 256.777099609375, "learning_rate": 1e-06, "loss": 0.1514, "step": 310 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3142601400613785, "epoch": 0.4983974358974359, "grad_norm": 0.41229262948036194, "learning_rate": 1e-06, "loss": 0.1924, "step": 311 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.29151175916194916, "epoch": 0.5, "grad_norm": 0.013734478503465652, "learning_rate": 1e-06, "loss": 0.2275, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.150390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4919.0, "completions/mean_length": 3832.859375, "completions/mean_terminated_length": 1611.1632080078125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "entropy": 0.29491689801216125, "epoch": 0.5016025641025641, "frac_reward_zero_std": 0.0625, "grad_norm": 841.0869140625, "learning_rate": 1e-06, "loss": 0.2169, "num_tokens": 313608521.0, "reward": 0.32326555252075195, "reward_std": 0.19986380636692047, "rewards/progression_diversity/mean": -0.06456150114536285, "rewards/progression_diversity/std": 0.15824860334396362, "rewards/symbolic_reward_accuracy/mean": 0.251953125, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.6194173097610474, "rewards/symbolic_reward_partial_score/std": 0.3335736393928528, "rewards/tag_count_reward/mean": -0.130859375, "rewards/tag_count_reward/std": 0.33757632970809937, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.027578353881836, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 512.0, "sampling/sampling_logp_difference/mean": 10.916871070861816, "step": 313 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.33330613374710083, "epoch": 0.5032051282051282, "grad_norm": 125.96926879882812, "learning_rate": 1e-06, "loss": 0.1417, "step": 314 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2990605980157852, "epoch": 0.5048076923076923, "grad_norm": 0.028910215944051743, "learning_rate": 1e-06, "loss": 0.1997, "step": 315 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.31929296255111694, "epoch": 0.5064102564102564, "grad_norm": 114.29574584960938, "learning_rate": 1e-06, "loss": 0.1296, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6009.0, "completions/mean_length": 4275.66015625, "completions/mean_terminated_length": 1762.6085205078125, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.32325243949890137, "epoch": 0.5080128205128205, "frac_reward_zero_std": 0.0, "grad_norm": 370.2263488769531, "learning_rate": 1e-06, "loss": 0.1346, "num_tokens": 316649083.0, "reward": 0.24201995134353638, "reward_std": 0.18078970909118652, "rewards/progression_diversity/mean": -0.06900203227996826, "rewards/progression_diversity/std": 0.1568773090839386, "rewards/symbolic_reward_accuracy/mean": 0.142578125, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.5694499015808105, "rewards/symbolic_reward_partial_score/std": 0.31546229124069214, "rewards/tag_count_reward/mean": -0.13671875, "rewards/tag_count_reward/std": 0.3438861668109894, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0173547267913818, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 512.0, "sampling/sampling_logp_difference/mean": 12.444093704223633, "step": 317 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.2793115973472595, "epoch": 0.5096153846153846, "grad_norm": 7.101436614990234, "learning_rate": 1e-06, "loss": 0.2415, "step": 318 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.2891732454299927, "epoch": 0.5112179487179487, "grad_norm": 0.021407851949334145, "learning_rate": 1e-06, "loss": 0.2313, "step": 319 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.28606168925762177, "epoch": 0.5128205128205128, "grad_norm": 0.01678905449807644, "learning_rate": 1e-06, "loss": 0.2457, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4942.0, "completions/mean_length": 3954.216796875, "completions/mean_terminated_length": 1720.2926025390625, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.2966272532939911, "epoch": 0.5144230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 758.809814453125, "learning_rate": 1e-06, "loss": 0.2448, "num_tokens": 319542234.0, "reward": 0.23621943593025208, "reward_std": 0.17420727014541626, "rewards/progression_diversity/mean": -0.062140051275491714, "rewards/progression_diversity/std": 0.15392348170280457, "rewards/symbolic_reward_accuracy/mean": 0.126953125, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.5752767324447632, "rewards/symbolic_reward_partial_score/std": 0.3146626949310303, "rewards/tag_count_reward/mean": -0.119140625, "rewards/tag_count_reward/std": 0.32427072525024414, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0240211486816406, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 512.0, "sampling/sampling_logp_difference/mean": 11.527631759643555, "step": 321 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.33031274378299713, "epoch": 0.5160256410256411, "grad_norm": 0.021069565787911415, "learning_rate": 1e-06, "loss": 0.1706, "step": 322 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.3213012218475342, "epoch": 0.5176282051282052, "grad_norm": 0.34873709082603455, "learning_rate": 1e-06, "loss": 0.1742, "step": 323 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.30144381523132324, "epoch": 0.5192307692307693, "grad_norm": 0.02283627726137638, "learning_rate": 1e-06, "loss": 0.2115, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4747.0, "completions/mean_length": 4465.17578125, "completions/mean_terminated_length": 1854.3857421875, "completions/min_length": 530.0, "completions/min_terminated_length": 530.0, "entropy": 0.32395896315574646, "epoch": 0.5208333333333334, "frac_reward_zero_std": 0.03125, "grad_norm": 343.3765563964844, "learning_rate": 1e-06, "loss": 0.1178, "num_tokens": 322758580.0, "reward": 0.20879912376403809, "reward_std": 0.1630467027425766, "rewards/progression_diversity/mean": -0.07272521406412125, "rewards/progression_diversity/std": 0.16325299441814423, "rewards/symbolic_reward_accuracy/mean": 0.099609375, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.5441243648529053, "rewards/symbolic_reward_partial_score/std": 0.3322213888168335, "rewards/tag_count_reward/mean": -0.134765625, "rewards/tag_count_reward/std": 0.3418070077896118, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0155704021453857, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 512.0, "sampling/sampling_logp_difference/mean": 12.669414520263672, "step": 325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3014257103204727, "epoch": 0.5224358974358975, "grad_norm": 0.020518580451607704, "learning_rate": 1e-06, "loss": 0.2602, "step": 326 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.29575052857398987, "epoch": 0.5240384615384616, "grad_norm": 530.0026245117188, "learning_rate": 1e-06, "loss": 0.2193, "step": 327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.3828125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.2522171512246132, "epoch": 0.5256410256410257, "grad_norm": 0.015645645558834076, "learning_rate": 1e-06, "loss": 0.3602, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5689.0, "completions/mean_length": 4896.400390625, "completions/mean_terminated_length": 1968.1888427734375, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "entropy": 0.28659459948539734, "epoch": 0.5272435897435898, "frac_reward_zero_std": 0.0, "grad_norm": 472.4561462402344, "learning_rate": 1e-06, "loss": 0.1946, "num_tokens": 326107713.0, "reward": 0.27258506417274475, "reward_std": 0.22720691561698914, "rewards/progression_diversity/mean": -0.0842689722776413, "rewards/progression_diversity/std": 0.17504683136940002, "rewards/symbolic_reward_accuracy/mean": 0.20703125, "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, "rewards/symbolic_reward_partial_score/mean": 0.554003894329071, "rewards/symbolic_reward_partial_score/std": 0.3567145764827728, "rewards/tag_count_reward/mean": -0.169921875, "rewards/tag_count_reward/std": 0.3759314715862274, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.014817714691162, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 512.0, "sampling/sampling_logp_difference/mean": 12.392074584960938, "step": 329 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.2568201795220375, "epoch": 0.5288461538461539, "grad_norm": 0.02229255437850952, "learning_rate": 1e-06, "loss": 0.2685, "step": 330 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.2770218998193741, "epoch": 0.530448717948718, "grad_norm": 0.020292697474360466, "learning_rate": 1e-06, "loss": 0.2106, "step": 331 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.27906210720539093, "epoch": 0.532051282051282, "grad_norm": 0.01652703620493412, "learning_rate": 1e-06, "loss": 0.2365, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.240234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6400.0, "completions/mean_length": 5536.8125, "completions/mean_terminated_length": 2106.98193359375, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.24760161340236664, "epoch": 0.5336538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 698.7391967773438, "learning_rate": 1e-06, "loss": 0.2491, "num_tokens": 329775969.0, "reward": 0.1934526264667511, "reward_std": 0.1800420880317688, "rewards/progression_diversity/mean": -0.09467939287424088, "rewards/progression_diversity/std": 0.17836976051330566, "rewards/symbolic_reward_accuracy/mean": 0.107421875, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.49760740995407104, "rewards/symbolic_reward_partial_score/std": 0.3303731083869934, "rewards/tag_count_reward/mean": -0.193359375, "rewards/tag_count_reward/std": 0.39531853795051575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0133090019226074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 12.529105186462402, "step": 333 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.27296438813209534, "epoch": 0.5352564102564102, "grad_norm": 238.9752960205078, "learning_rate": 1e-06, "loss": 0.2225, "step": 334 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.248204804956913, "epoch": 0.5368589743589743, "grad_norm": 106.70677185058594, "learning_rate": 1e-06, "loss": 0.2809, "step": 335 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.2727896124124527, "epoch": 0.5384615384615384, "grad_norm": 0.031822893768548965, "learning_rate": 1e-06, "loss": 0.2527, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5026.0, "completions/mean_length": 5709.119140625, "completions/mean_terminated_length": 2001.002685546875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.25325363129377365, "epoch": 0.5400641025641025, "frac_reward_zero_std": 0.0, "grad_norm": 363.06060791015625, "learning_rate": 1e-06, "loss": 0.2604, "num_tokens": 333527678.0, "reward": 0.23530638217926025, "reward_std": 0.22434313595294952, "rewards/progression_diversity/mean": -0.1065697893500328, "rewards/progression_diversity/std": 0.18764851987361908, "rewards/symbolic_reward_accuracy/mean": 0.171875, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.5112141966819763, "rewards/symbolic_reward_partial_score/std": 0.3657578229904175, "rewards/tag_count_reward/mean": -0.201171875, "rewards/tag_count_reward/std": 0.4012683033943176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.006093144416809, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 14.130290031433105, "step": 337 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.24173004925251007, "epoch": 0.5416666666666666, "grad_norm": 319.6545715332031, "learning_rate": 1e-06, "loss": 0.2882, "step": 338 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2622825503349304, "epoch": 0.5432692307692307, "grad_norm": 219.5463409423828, "learning_rate": 1e-06, "loss": 0.2093, "step": 339 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.2560427188873291, "epoch": 0.5448717948717948, "grad_norm": 0.026842230930924416, "learning_rate": 1e-06, "loss": 0.2614, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.279296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5854.0, "completions/mean_length": 6195.84765625, "completions/mean_terminated_length": 2247.593505859375, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "entropy": 0.23094411194324493, "epoch": 0.5464743589743589, "frac_reward_zero_std": 0.0, "grad_norm": 853.015625, "learning_rate": 1e-06, "loss": 0.3191, "num_tokens": 337591040.0, "reward": 0.1426115334033966, "reward_std": 0.17017614841461182, "rewards/progression_diversity/mean": -0.10945233702659607, "rewards/progression_diversity/std": 0.18531769514083862, "rewards/symbolic_reward_accuracy/mean": 0.064453125, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.4288899898529053, "rewards/symbolic_reward_partial_score/std": 0.330017626285553, "rewards/tag_count_reward/mean": -0.236328125, "rewards/tag_count_reward/std": 0.42524150013923645, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0073082447052002, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 13.80842399597168, "step": 341 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.2576480060815811, "epoch": 0.5480769230769231, "grad_norm": 14.71423053741455, "learning_rate": 1e-06, "loss": 0.2245, "step": 342 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.3515625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.2539537325501442, "epoch": 0.5496794871794872, "grad_norm": 0.01885385625064373, "learning_rate": 1e-06, "loss": 0.2631, "step": 343 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.2588704973459244, "epoch": 0.5512820512820513, "grad_norm": 0.022954288870096207, "learning_rate": 1e-06, "loss": 0.2359, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16384.0, "completions/max_terminated_length": 5730.0, "completions/mean_length": 5692.001953125, "completions/mean_terminated_length": 2128.002685546875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "entropy": 0.28620630502700806, "epoch": 0.5528846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 664.2322998046875, "learning_rate": 1e-06, "loss": 0.2053, "num_tokens": 341301089.0, "reward": 0.21951040625572205, "reward_std": 0.22114171087741852, "rewards/progression_diversity/mean": -0.09241549670696259, "rewards/progression_diversity/std": 0.17231649160385132, "rewards/symbolic_reward_accuracy/mean": 0.1484375, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.5049642324447632, "rewards/symbolic_reward_partial_score/std": 0.3561265468597412, "rewards/tag_count_reward/mean": -0.201171875, "rewards/tag_count_reward/std": 0.4012683033943176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0092263221740723, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 13.769186019897461, "step": 345 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.2516386955976486, "epoch": 0.5544871794871795, "grad_norm": 547.4846801757812, "learning_rate": 1e-06, "loss": 0.3033, "step": 346 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.2582557052373886, "epoch": 0.5560897435897436, "grad_norm": 0.3687719404697418, "learning_rate": 1e-06, "loss": 0.2881, "step": 347 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.25732411444187164, "epoch": 0.5576923076923077, "grad_norm": 0.022600045427680016, "learning_rate": 1e-06, "loss": 0.1909, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5114.0, "completions/mean_length": 5463.091796875, "completions/mean_terminated_length": 2192.36279296875, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "entropy": 0.2584301382303238, "epoch": 0.5592948717948718, "frac_reward_zero_std": 0.0, "grad_norm": 1072.4036865234375, "learning_rate": 1e-06, "loss": 0.2856, "num_tokens": 344947984.0, "reward": 0.17411813139915466, "reward_std": 0.17677854001522064, "rewards/progression_diversity/mean": -0.08086328208446503, "rewards/progression_diversity/std": 0.1584118902683258, "rewards/symbolic_reward_accuracy/mean": 0.083984375, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.4769693911075592, "rewards/symbolic_reward_partial_score/std": 0.3174276053905487, "rewards/tag_count_reward/mean": -0.185546875, "rewards/tag_count_reward/std": 0.38912075757980347, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.015385389328003, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 12.646538734436035, "step": 349 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.28206463158130646, "epoch": 0.5608974358974359, "grad_norm": 23.78731918334961, "learning_rate": 1e-06, "loss": 0.2093, "step": 350 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2668316960334778, "epoch": 0.5625, "grad_norm": 0.019982578232884407, "learning_rate": 1e-06, "loss": 0.2188, "step": 351 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.27186111360788345, "epoch": 0.5641025641025641, "grad_norm": 0.04332799091935158, "learning_rate": 1e-06, "loss": 0.2327, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5015.0, "completions/mean_length": 5713.78515625, "completions/mean_terminated_length": 2045.0235595703125, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 0.24826274812221527, "epoch": 0.5657051282051282, "frac_reward_zero_std": 0.0, "grad_norm": 2501.171875, "learning_rate": 1e-06, "loss": 0.2487, "num_tokens": 348796562.0, "reward": 0.2261546105146408, "reward_std": 0.22966080904006958, "rewards/progression_diversity/mean": -0.09401117265224457, "rewards/progression_diversity/std": 0.1731572151184082, "rewards/symbolic_reward_accuracy/mean": 0.166015625, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.4926595091819763, "rewards/symbolic_reward_partial_score/std": 0.3556332588195801, "rewards/tag_count_reward/mean": -0.203125, "rewards/tag_count_reward/std": 0.4027182459831238, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0071918964385986, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 14.334271430969238, "step": 353 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.23615773022174835, "epoch": 0.5673076923076923, "grad_norm": 180.40103149414062, "learning_rate": 1e-06, "loss": 0.3324, "step": 354 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.2617553621530533, "epoch": 0.5689102564102564, "grad_norm": 15.31356143951416, "learning_rate": 1e-06, "loss": 0.2533, "step": 355 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.28039678931236267, "epoch": 0.5705128205128205, "grad_norm": 0.03650696948170662, "learning_rate": 1e-06, "loss": 0.209, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.30859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6289.0, "completions/mean_length": 6547.625, "completions/mean_terminated_length": 2157.37841796875, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "entropy": 0.2463904321193695, "epoch": 0.5721153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 776.6242065429688, "learning_rate": 1e-06, "loss": 0.2591, "num_tokens": 353049346.0, "reward": 0.15728139877319336, "reward_std": 0.19684484601020813, "rewards/progression_diversity/mean": -0.11365720629692078, "rewards/progression_diversity/std": 0.1853574961423874, "rewards/symbolic_reward_accuracy/mean": 0.083984375, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.4564453363418579, "rewards/symbolic_reward_partial_score/std": 0.3446153700351715, "rewards/tag_count_reward/mean": -0.2890625, "rewards/tag_count_reward/std": 0.45377036929130554, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0021424293518066, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 15.470850944519043, "step": 357 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.23972290009260178, "epoch": 0.5737179487179487, "grad_norm": 157.12100219726562, "learning_rate": 1e-06, "loss": 0.2861, "step": 358 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.2611324340105057, "epoch": 0.5753205128205128, "grad_norm": 0.026206420734524727, "learning_rate": 1e-06, "loss": 0.2928, "step": 359 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6328125, "entropy": 0.2634492665529251, "epoch": 0.5769230769230769, "grad_norm": 6.0313310623168945, "learning_rate": 1e-06, "loss": 0.346, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5884.0, "completions/mean_length": 6286.490234375, "completions/mean_terminated_length": 2180.909423828125, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.2577165514230728, "epoch": 0.5785256410256411, "frac_reward_zero_std": 0.0, "grad_norm": 268.4336853027344, "learning_rate": 1e-06, "loss": 0.2247, "num_tokens": 357130125.0, "reward": 0.1658135950565338, "reward_std": 0.19578394293785095, "rewards/progression_diversity/mean": -0.116884745657444, "rewards/progression_diversity/std": 0.19544658064842224, "rewards/symbolic_reward_accuracy/mean": 0.09375, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.45634764432907104, "rewards/symbolic_reward_partial_score/std": 0.3385258913040161, "rewards/tag_count_reward/mean": -0.26171875, "rewards/tag_count_reward/std": 0.44000017642974854, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001070261001587, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 15.70700454711914, "step": 361 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.21536707878112793, "epoch": 0.5801282051282052, "grad_norm": 2007.2310791015625, "learning_rate": 1e-06, "loss": 0.3851, "step": 362 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.59375, "entropy": 0.23036614060401917, "epoch": 0.5817307692307693, "grad_norm": 188.44676208496094, "learning_rate": 1e-06, "loss": 0.3252, "step": 363 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5859375, "entropy": 0.24637820571660995, "epoch": 0.5833333333333334, "grad_norm": 0.025239424780011177, "learning_rate": 1e-06, "loss": 0.2805, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5414.0, "completions/mean_length": 7036.28125, "completions/mean_terminated_length": 2307.435302734375, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "entropy": 0.2505442202091217, "epoch": 0.5849358974358975, "frac_reward_zero_std": 0.0, "grad_norm": 329.7117614746094, "learning_rate": 1e-06, "loss": 0.1805, "num_tokens": 361515437.0, "reward": 0.15802721679210663, "reward_std": 0.16592083871364594, "rewards/progression_diversity/mean": -0.14015081524848938, "rewards/progression_diversity/std": 0.20965632796287537, "rewards/symbolic_reward_accuracy/mean": 0.080078125, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.47153323888778687, "rewards/symbolic_reward_partial_score/std": 0.3264097571372986, "rewards/tag_count_reward/mean": -0.30078125, "rewards/tag_count_reward/std": 0.45904624462127686, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9985228180885315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 15.96454906463623, "step": 365 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.21816373616456985, "epoch": 0.5865384615384616, "grad_norm": 1167.26611328125, "learning_rate": 1e-06, "loss": 0.2898, "step": 366 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3828125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6015625, "entropy": 0.2148098573088646, "epoch": 0.5881410256410257, "grad_norm": 8.457894325256348, "learning_rate": 1e-06, "loss": 0.3274, "step": 367 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.65625, "entropy": 0.2301519215106964, "epoch": 0.5897435897435898, "grad_norm": 40.98564147949219, "learning_rate": 1e-06, "loss": 0.2575, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.380859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5608.0, "completions/mean_length": 7724.8984375, "completions/mean_terminated_length": 2398.32177734375, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "entropy": 0.20847928524017334, "epoch": 0.5913461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 825.861083984375, "learning_rate": 1e-06, "loss": 0.2632, "num_tokens": 366384745.0, "reward": 0.16988316178321838, "reward_std": 0.19900578260421753, "rewards/progression_diversity/mean": -0.1527988612651825, "rewards/progression_diversity/std": 0.21360373497009277, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.4359537959098816, "rewards/symbolic_reward_partial_score/std": 0.35540899634361267, "rewards/tag_count_reward/mean": -0.33203125, "rewards/tag_count_reward/std": 0.47140273451805115, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913198947906494, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 16.873680114746094, "step": 369 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.1968770995736122, "epoch": 0.592948717948718, "grad_norm": 528.3468627929688, "learning_rate": 1e-06, "loss": 0.3157, "step": 370 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.3515625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.19093196094036102, "epoch": 0.594551282051282, "grad_norm": 962.9395751953125, "learning_rate": 1e-06, "loss": 0.3246, "step": 371 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.20564716309309006, "epoch": 0.5961538461538461, "grad_norm": 17.06043243408203, "learning_rate": 1e-06, "loss": 0.281, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6353.0, "completions/mean_length": 6398.890625, "completions/mean_terminated_length": 2339.010986328125, "completions/min_length": 467.0, "completions/min_terminated_length": 467.0, "entropy": 0.25001922249794006, "epoch": 0.5977564102564102, "frac_reward_zero_std": 0.0, "grad_norm": 419.783447265625, "learning_rate": 1e-06, "loss": 0.2007, "num_tokens": 370463153.0, "reward": 0.15392863750457764, "reward_std": 0.17637325823307037, "rewards/progression_diversity/mean": -0.11885648965835571, "rewards/progression_diversity/std": 0.2015119045972824, "rewards/symbolic_reward_accuracy/mean": 0.052734375, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.5014322996139526, "rewards/symbolic_reward_partial_score/std": 0.3110915422439575, "rewards/tag_count_reward/mean": -0.26953125, "rewards/tag_count_reward/std": 0.44415023922920227, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9979949593544006, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 16.023746490478516, "step": 373 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2377210631966591, "epoch": 0.5993589743589743, "grad_norm": 631.596435546875, "learning_rate": 1e-06, "loss": 0.3175, "step": 374 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2239432856440544, "epoch": 0.6009615384615384, "grad_norm": 676.9954223632812, "learning_rate": 1e-06, "loss": 0.307, "step": 375 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2474696934223175, "epoch": 0.6025641025641025, "grad_norm": 75.13766479492188, "learning_rate": 1e-06, "loss": 0.207, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6881.0, "completions/mean_length": 6655.6796875, "completions/mean_terminated_length": 2392.7080078125, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.2293773591518402, "epoch": 0.6041666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 938.2958984375, "learning_rate": 1e-06, "loss": 0.2922, "num_tokens": 374795517.0, "reward": 0.16000205278396606, "reward_std": 0.18099814653396606, "rewards/progression_diversity/mean": -0.11991134285926819, "rewards/progression_diversity/std": 0.19749855995178223, "rewards/symbolic_reward_accuracy/mean": 0.08203125, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.4598633050918579, "rewards/symbolic_reward_partial_score/std": 0.3230631649494171, "rewards/tag_count_reward/mean": -0.259765625, "rewards/tag_count_reward/std": 0.4389347732067108, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0005474090576172, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 15.338663101196289, "step": 377 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2615222707390785, "epoch": 0.6057692307692307, "grad_norm": 1569.6458740234375, "learning_rate": 1e-06, "loss": 0.1825, "step": 378 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2232813537120819, "epoch": 0.6073717948717948, "grad_norm": 432.9651184082031, "learning_rate": 1e-06, "loss": 0.3715, "step": 379 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.24861778318881989, "epoch": 0.6089743589743589, "grad_norm": 11.820572853088379, "learning_rate": 1e-06, "loss": 0.2359, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.31640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5300.0, "completions/mean_length": 6716.076171875, "completions/mean_terminated_length": 2241.20849609375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.2513297498226166, "epoch": 0.6105769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 598.6245727539062, "learning_rate": 1e-06, "loss": 0.1991, "num_tokens": 379059012.0, "reward": 0.2126418650150299, "reward_std": 0.20646947622299194, "rewards/progression_diversity/mean": -0.1269288957118988, "rewards/progression_diversity/std": 0.20075669884681702, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.4962402582168579, "rewards/symbolic_reward_partial_score/std": 0.35096409916877747, "rewards/tag_count_reward/mean": -0.287109375, "rewards/tag_count_reward/std": 0.45285552740097046, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986449480056763, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 15.811168670654297, "step": 381 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2153405398130417, "epoch": 0.6121794871794872, "grad_norm": 698.4373779296875, "learning_rate": 1e-06, "loss": 0.4257, "step": 382 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.2359524369239807, "epoch": 0.6137820512820513, "grad_norm": 251.32823181152344, "learning_rate": 1e-06, "loss": 0.2656, "step": 383 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.2591070830821991, "epoch": 0.6153846153846154, "grad_norm": 1601.9156494140625, "learning_rate": 1e-06, "loss": 0.1929, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16384.0, "completions/max_terminated_length": 13378.0, "completions/mean_length": 5822.728515625, "completions/mean_terminated_length": 2302.3046875, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "entropy": 0.27983543276786804, "epoch": 0.6169871794871795, "frac_reward_zero_std": 0.0, "grad_norm": 612.2117919921875, "learning_rate": 1e-06, "loss": 0.1586, "num_tokens": 382808633.0, "reward": 0.22261884808540344, "reward_std": 0.21902015805244446, "rewards/progression_diversity/mean": -0.10237417370080948, "rewards/progression_diversity/std": 0.1874011605978012, "rewards/symbolic_reward_accuracy/mean": 0.150390625, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.522167980670929, "rewards/symbolic_reward_partial_score/std": 0.3341177999973297, "rewards/tag_count_reward/mean": -0.232421875, "rewards/tag_count_reward/std": 0.42278963327407837, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0090166330337524, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 13.79207706451416, "step": 385 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.24246982485055923, "epoch": 0.6185897435897436, "grad_norm": 324.906005859375, "learning_rate": 1e-06, "loss": 0.3371, "step": 386 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2653020918369293, "epoch": 0.6201923076923077, "grad_norm": 195.28736877441406, "learning_rate": 1e-06, "loss": 0.2096, "step": 387 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.2655164748430252, "epoch": 0.6217948717948718, "grad_norm": 68.55646514892578, "learning_rate": 1e-06, "loss": 0.2088, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.33984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12824.0, "completions/mean_length": 7204.025390625, "completions/mean_terminated_length": 2478.23974609375, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "entropy": 0.21731575578451157, "epoch": 0.6233974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 1553.82080078125, "learning_rate": 1e-06, "loss": 0.3047, "num_tokens": 387345190.0, "reward": 0.16609683632850647, "reward_std": 0.20702703297138214, "rewards/progression_diversity/mean": -0.1368994563817978, "rewards/progression_diversity/std": 0.20169976353645325, "rewards/symbolic_reward_accuracy/mean": 0.10546875, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.44949543476104736, "rewards/symbolic_reward_partial_score/std": 0.3339717984199524, "rewards/tag_count_reward/mean": -0.306640625, "rewards/tag_count_reward/std": 0.4615498185157776, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9934464693069458, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 16.535369873046875, "step": 389 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.4921875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.65625, "entropy": 0.1892649158835411, "epoch": 0.625, "grad_norm": 104.23577880859375, "learning_rate": 1e-06, "loss": 0.348, "step": 390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.24794911593198776, "epoch": 0.6266025641025641, "grad_norm": 0.032321542501449585, "learning_rate": 1e-06, "loss": 0.1668, "step": 391 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.578125, "entropy": 0.21500948816537857, "epoch": 0.6282051282051282, "grad_norm": 1.6792036294937134, "learning_rate": 1e-06, "loss": 0.2748, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.314453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13686.0, "completions/mean_length": 6848.802734375, "completions/mean_terminated_length": 2475.108154296875, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.23190298676490784, "epoch": 0.6298076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 609.8284301757812, "learning_rate": 1e-06, "loss": 0.2339, "num_tokens": 391783393.0, "reward": 0.14935705065727234, "reward_std": 0.17916199564933777, "rewards/progression_diversity/mean": -0.1165420189499855, "rewards/progression_diversity/std": 0.18923556804656982, "rewards/symbolic_reward_accuracy/mean": 0.078125, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.44054362177848816, "rewards/symbolic_reward_partial_score/std": 0.3393227458000183, "rewards/tag_count_reward/mean": -0.28515625, "rewards/tag_count_reward/std": 0.45193037390708923, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.002633810043335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 14.761566162109375, "step": 393 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2784353643655777, "epoch": 0.6314102564102564, "grad_norm": 262.9246520996094, "learning_rate": 1e-06, "loss": 0.1924, "step": 394 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.3984375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.65625, "entropy": 0.21299445629119873, "epoch": 0.6330128205128205, "grad_norm": 244.2508087158203, "learning_rate": 1e-06, "loss": 0.3124, "step": 395 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.65625, "entropy": 0.22290880978107452, "epoch": 0.6346153846153846, "grad_norm": 0.022922182455658913, "learning_rate": 1e-06, "loss": 0.3096, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.322265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6123.0, "completions/mean_length": 6933.5625, "completions/mean_terminated_length": 2439.838623046875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "entropy": 0.25610800087451935, "epoch": 0.6362179487179487, "frac_reward_zero_std": 0.0, "grad_norm": 492.3603210449219, "learning_rate": 1e-06, "loss": 0.2415, "num_tokens": 396120849.0, "reward": 0.14940249919891357, "reward_std": 0.18980038166046143, "rewards/progression_diversity/mean": -0.124204620718956, "rewards/progression_diversity/std": 0.19669218361377716, "rewards/symbolic_reward_accuracy/mean": 0.08203125, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.43704426288604736, "rewards/symbolic_reward_partial_score/std": 0.3477852940559387, "rewards/tag_count_reward/mean": -0.296875, "rewards/tag_count_reward/std": 0.45732781291007996, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0012847185134888, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 15.557909965515137, "step": 397 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.23665452748537064, "epoch": 0.6378205128205128, "grad_norm": 1791.0264892578125, "learning_rate": 1e-06, "loss": 0.2836, "step": 398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5625, "entropy": 0.2314099296927452, "epoch": 0.6394230769230769, "grad_norm": 193.3421630859375, "learning_rate": 1e-06, "loss": 0.3197, "step": 399 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.24476408958435059, "epoch": 0.6410256410256411, "grad_norm": 7.516962051391602, "learning_rate": 1e-06, "loss": 0.2566, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6095.0, "completions/mean_length": 6450.169921875, "completions/mean_terminated_length": 2411.14013671875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.23504739999771118, "epoch": 0.6426282051282052, "frac_reward_zero_std": 0.0, "grad_norm": 792.7737426757812, "learning_rate": 1e-06, "loss": 0.306, "num_tokens": 400280488.0, "reward": 0.18703122437000275, "reward_std": 0.192929208278656, "rewards/progression_diversity/mean": -0.10351836681365967, "rewards/progression_diversity/std": 0.17522378265857697, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.4745442867279053, "rewards/symbolic_reward_partial_score/std": 0.3318890631198883, "rewards/tag_count_reward/mean": -0.24609375, "rewards/tag_count_reward/std": 0.4311550557613373, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0084717273712158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 14.434258460998535, "step": 401 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.25011158734560013, "epoch": 0.6442307692307693, "grad_norm": 317.9718322753906, "learning_rate": 1e-06, "loss": 0.2485, "step": 402 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.261634424328804, "epoch": 0.6458333333333334, "grad_norm": 102.99897766113281, "learning_rate": 1e-06, "loss": 0.2611, "step": 403 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.2772502601146698, "epoch": 0.6474358974358975, "grad_norm": 31.346302032470703, "learning_rate": 1e-06, "loss": 0.1701, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7054.0, "completions/mean_length": 6076.56640625, "completions/mean_terminated_length": 2532.540771484375, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "entropy": 0.2811954766511917, "epoch": 0.6490384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 863.798828125, "learning_rate": 1e-06, "loss": 0.2297, "num_tokens": 404250186.0, "reward": 0.1544865369796753, "reward_std": 0.17089833319187164, "rewards/progression_diversity/mean": -0.08845682442188263, "rewards/progression_diversity/std": 0.16534774005413055, "rewards/symbolic_reward_accuracy/mean": 0.05078125, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.4886067509651184, "rewards/symbolic_reward_partial_score/std": 0.3330814838409424, "rewards/tag_count_reward/mean": -0.216796875, "rewards/tag_count_reward/std": 0.4124660789966583, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0118536949157715, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 13.839906692504883, "step": 405 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.269717812538147, "epoch": 0.6506410256410257, "grad_norm": 659.154052734375, "learning_rate": 1e-06, "loss": 0.2937, "step": 406 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.25652148574590683, "epoch": 0.6522435897435898, "grad_norm": 0.023287350311875343, "learning_rate": 1e-06, "loss": 0.2854, "step": 407 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3671875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6015625, "entropy": 0.2793606072664261, "epoch": 0.6538461538461539, "grad_norm": 0.029362130910158157, "learning_rate": 1e-06, "loss": 0.2409, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7332.0, "completions/mean_length": 6744.724609375, "completions/mean_terminated_length": 2674.808349609375, "completions/min_length": 504.0, "completions/min_terminated_length": 504.0, "entropy": 0.23652763664722443, "epoch": 0.655448717948718, "frac_reward_zero_std": 0.0, "grad_norm": 488.6410827636719, "learning_rate": 1e-06, "loss": 0.3058, "num_tokens": 408669469.0, "reward": 0.1471533626317978, "reward_std": 0.18105512857437134, "rewards/progression_diversity/mean": -0.10253537446260452, "rewards/progression_diversity/std": 0.17895787954330444, "rewards/symbolic_reward_accuracy/mean": 0.076171875, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.42687174677848816, "rewards/symbolic_reward_partial_score/std": 0.3458375334739685, "rewards/tag_count_reward/mean": -0.255859375, "rewards/tag_count_reward/std": 0.43676990270614624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0076942443847656, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 520.0, "sampling/sampling_logp_difference/mean": 13.969697952270508, "step": 409 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.2513129413127899, "epoch": 0.657051282051282, "grad_norm": 781.8362426757812, "learning_rate": 1e-06, "loss": 0.3877, "step": 410 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.640625, "entropy": 0.2599927484989166, "epoch": 0.6586538461538461, "grad_norm": 8.609899520874023, "learning_rate": 1e-06, "loss": 0.2786, "step": 411 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3515625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.23813901096582413, "epoch": 0.6602564102564102, "grad_norm": 0.03463734686374664, "learning_rate": 1e-06, "loss": 0.3067, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.248046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13444.0, "completions/mean_length": 6043.830078125, "completions/mean_terminated_length": 2632.916748046875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "entropy": 0.2797359824180603, "epoch": 0.6618589743589743, "frac_reward_zero_std": 0.0, "grad_norm": 446.9456787109375, "learning_rate": 1e-06, "loss": 0.2367, "num_tokens": 412617750.0, "reward": 0.1771666407585144, "reward_std": 0.18979674577713013, "rewards/progression_diversity/mean": -0.08070126920938492, "rewards/progression_diversity/std": 0.15634961426258087, "rewards/symbolic_reward_accuracy/mean": 0.09375, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.4754069149494171, "rewards/symbolic_reward_partial_score/std": 0.336232453584671, "rewards/tag_count_reward/mean": -0.208984375, "rewards/tag_count_reward/std": 0.40698084235191345, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0141136646270752, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 524.0, "sampling/sampling_logp_difference/mean": 12.855762481689453, "step": 413 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2539880573749542, "epoch": 0.6634615384615384, "grad_norm": 368.5082092285156, "learning_rate": 1e-06, "loss": 0.3098, "step": 414 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2531498521566391, "epoch": 0.6650641025641025, "grad_norm": 162.2779541015625, "learning_rate": 1e-06, "loss": 0.2807, "step": 415 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.2950977236032486, "epoch": 0.6666666666666666, "grad_norm": 129.06211853027344, "learning_rate": 1e-06, "loss": 0.1478, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12448.0, "completions/mean_length": 5337.125, "completions/mean_terminated_length": 2588.8779296875, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "entropy": 0.28342205286026, "epoch": 0.6682692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 795.7291259765625, "learning_rate": 1e-06, "loss": 0.2281, "num_tokens": 416218758.0, "reward": 0.18160498142242432, "reward_std": 0.17491479218006134, "rewards/progression_diversity/mean": -0.06606545299291611, "rewards/progression_diversity/std": 0.14889149367809296, "rewards/symbolic_reward_accuracy/mean": 0.087890625, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.48906251788139343, "rewards/symbolic_reward_partial_score/std": 0.3222104609012604, "rewards/tag_count_reward/mean": -0.171875, "rewards/tag_count_reward/std": 0.3776407241821289, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0209629535675049, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 524.0, "sampling/sampling_logp_difference/mean": 11.58393383026123, "step": 417 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.29566076397895813, "epoch": 0.6698717948717948, "grad_norm": 879.3897094726562, "learning_rate": 1e-06, "loss": 0.2653, "step": 418 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2756947875022888, "epoch": 0.6714743589743589, "grad_norm": 92.3701400756836, "learning_rate": 1e-06, "loss": 0.2869, "step": 419 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.31204286217689514, "epoch": 0.6730769230769231, "grad_norm": 0.028281621634960175, "learning_rate": 1e-06, "loss": 0.1507, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6618.0, "completions/mean_length": 5795.564453125, "completions/mean_terminated_length": 2693.901611328125, "completions/min_length": 476.0, "completions/min_terminated_length": 476.0, "entropy": 0.28321386873722076, "epoch": 0.6746794871794872, "frac_reward_zero_std": 0.0, "grad_norm": 1091.48193359375, "learning_rate": 1e-06, "loss": 0.2515, "num_tokens": 420108631.0, "reward": 0.20123516023159027, "reward_std": 0.18499191105365753, "rewards/progression_diversity/mean": -0.06691451370716095, "rewards/progression_diversity/std": 0.14319713413715363, "rewards/symbolic_reward_accuracy/mean": 0.130859375, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.47119140625, "rewards/symbolic_reward_partial_score/std": 0.3467746376991272, "rewards/tag_count_reward/mean": -0.1796875, "rewards/tag_count_reward/std": 0.38430243730545044, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0261237621307373, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 524.0, "sampling/sampling_logp_difference/mean": 10.4873685836792, "step": 421 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.29514726996421814, "epoch": 0.6762820512820513, "grad_norm": 0.027155712246894836, "learning_rate": 1e-06, "loss": 0.1629, "step": 422 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.27398838102817535, "epoch": 0.6778846153846154, "grad_norm": 0.03398658335208893, "learning_rate": 1e-06, "loss": 0.2472, "step": 423 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.28561370074748993, "epoch": 0.6794871794871795, "grad_norm": 0.03232736140489578, "learning_rate": 1e-06, "loss": 0.1731, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8604.0, "completions/mean_length": 6224.42578125, "completions/mean_terminated_length": 2766.97900390625, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "entropy": 0.28223684430122375, "epoch": 0.6810897435897436, "frac_reward_zero_std": 0.03125, "grad_norm": 1497.0281982421875, "learning_rate": 1e-06, "loss": 0.2344, "num_tokens": 424132385.0, "reward": 0.1739288568496704, "reward_std": 0.19503268599510193, "rewards/progression_diversity/mean": -0.07684145867824554, "rewards/progression_diversity/std": 0.15114691853523254, "rewards/symbolic_reward_accuracy/mean": 0.09375, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.46123045682907104, "rewards/symbolic_reward_partial_score/std": 0.3385530114173889, "rewards/tag_count_reward/mean": -0.19921875, "rewards/tag_count_reward/std": 0.39980348944664, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.015244483947754, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 524.0, "sampling/sampling_logp_difference/mean": 12.87832260131836, "step": 425 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.2601533979177475, "epoch": 0.6826923076923077, "grad_norm": 5.701667785644531, "learning_rate": 1e-06, "loss": 0.2746, "step": 426 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.28981634974479675, "epoch": 0.6842948717948718, "grad_norm": 0.028896579518914223, "learning_rate": 1e-06, "loss": 0.1887, "step": 427 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.3828125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.268518328666687, "epoch": 0.6858974358974359, "grad_norm": 0.03334295004606247, "learning_rate": 1e-06, "loss": 0.2642, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.267578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8233.0, "completions/mean_length": 6416.837890625, "completions/mean_terminated_length": 2775.501220703125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.26624637842178345, "epoch": 0.6875, "frac_reward_zero_std": 0.0, "grad_norm": 960.3082885742188, "learning_rate": 1e-06, "loss": 0.2586, "num_tokens": 428286878.0, "reward": 0.19910672307014465, "reward_std": 0.20500460267066956, "rewards/progression_diversity/mean": -0.07907611131668091, "rewards/progression_diversity/std": 0.1532817780971527, "rewards/symbolic_reward_accuracy/mean": 0.130859375, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.4729654788970947, "rewards/symbolic_reward_partial_score/std": 0.36421647667884827, "rewards/tag_count_reward/mean": -0.205078125, "rewards/tag_count_reward/std": 0.4041535556316376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0138148069381714, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 524.0, "sampling/sampling_logp_difference/mean": 12.851956367492676, "step": 429 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2550949156284332, "epoch": 0.6891025641025641, "grad_norm": 0.16806720197200775, "learning_rate": 1e-06, "loss": 0.2759, "step": 430 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.28539833426475525, "epoch": 0.6907051282051282, "grad_norm": 0.027446668595075607, "learning_rate": 1e-06, "loss": 0.2034, "step": 431 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5703125, "entropy": 0.2528308480978012, "epoch": 0.6923076923076923, "grad_norm": 0.4935199022293091, "learning_rate": 1e-06, "loss": 0.2769, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.22265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6021.0, "completions/mean_length": 5849.77734375, "completions/mean_terminated_length": 2832.437255859375, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "entropy": 0.2618635445833206, "epoch": 0.6939102564102564, "frac_reward_zero_std": 0.0, "grad_norm": 2757.423828125, "learning_rate": 1e-06, "loss": 0.2911, "num_tokens": 432106236.0, "reward": 0.14727163314819336, "reward_std": 0.16309772431850433, "rewards/progression_diversity/mean": -0.06434153020381927, "rewards/progression_diversity/std": 0.13670898973941803, "rewards/symbolic_reward_accuracy/mean": 0.0546875, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.44357097148895264, "rewards/symbolic_reward_partial_score/std": 0.31754741072654724, "rewards/tag_count_reward/mean": -0.1796875, "rewards/tag_count_reward/std": 0.38430243730545044, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.018375039100647, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 524.0, "sampling/sampling_logp_difference/mean": 12.233776092529297, "step": 433 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2820604145526886, "epoch": 0.6955128205128205, "grad_norm": 319.56304931640625, "learning_rate": 1e-06, "loss": 0.2209, "step": 434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.2742318958044052, "epoch": 0.6971153846153846, "grad_norm": 0.03161383047699928, "learning_rate": 1e-06, "loss": 0.232, "step": 435 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.28273947536945343, "epoch": 0.6987179487179487, "grad_norm": 0.10165081173181534, "learning_rate": 1e-06, "loss": 0.194, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13592.0, "completions/mean_length": 5897.6796875, "completions/mean_terminated_length": 2825.92919921875, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "entropy": 0.2722629904747009, "epoch": 0.7003205128205128, "frac_reward_zero_std": 0.0, "grad_norm": 247.05191040039062, "learning_rate": 1e-06, "loss": 0.2686, "num_tokens": 435945464.0, "reward": 0.17971304059028625, "reward_std": 0.16216953098773956, "rewards/progression_diversity/mean": -0.043344639241695404, "rewards/progression_diversity/std": 0.10168621689081192, "rewards/symbolic_reward_accuracy/mean": 0.08984375, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.4813476502895355, "rewards/symbolic_reward_partial_score/std": 0.332892507314682, "rewards/tag_count_reward/mean": -0.181640625, "rewards/tag_count_reward/std": 0.38592514395713806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0249509811401367, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 524.0, "sampling/sampling_logp_difference/mean": 9.850525856018066, "step": 437 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.2895769029855728, "epoch": 0.7019230769230769, "grad_norm": 77.53079986572266, "learning_rate": 1e-06, "loss": 0.2535, "step": 438 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3120630830526352, "epoch": 0.7035256410256411, "grad_norm": 0.035726454108953476, "learning_rate": 1e-06, "loss": 0.2106, "step": 439 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3188182860612869, "epoch": 0.7051282051282052, "grad_norm": 26.437782287597656, "learning_rate": 1e-06, "loss": 0.2773, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15335.0, "completions/mean_length": 5722.31640625, "completions/mean_terminated_length": 2938.72412109375, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "entropy": 0.31573159992694855, "epoch": 0.7067307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 219.33189392089844, "learning_rate": 1e-06, "loss": 0.2088, "num_tokens": 439743898.0, "reward": 0.16550663113594055, "reward_std": 0.17046445608139038, "rewards/progression_diversity/mean": -0.00011906892905244604, "rewards/progression_diversity/std": 0.002519554691389203, "rewards/symbolic_reward_accuracy/mean": 0.068359375, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.4709635376930237, "rewards/symbolic_reward_partial_score/std": 0.31855061650276184, "rewards/tag_count_reward/mean": -0.16796875, "rewards/tag_count_reward/std": 0.374204158782959, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0505764484405518, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 1.717551350593567, "step": 441 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.3088010251522064, "epoch": 0.7083333333333334, "grad_norm": 0.03410027176141739, "learning_rate": 1e-06, "loss": 0.1864, "step": 442 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.3080977499485016, "epoch": 0.7099358974358975, "grad_norm": 0.03793443366885185, "learning_rate": 1e-06, "loss": 0.2849, "step": 443 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.3152337819337845, "epoch": 0.7115384615384616, "grad_norm": 0.02681701071560383, "learning_rate": 1e-06, "loss": 0.2083, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.240234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7462.0, "completions/mean_length": 6347.04296875, "completions/mean_terminated_length": 3173.403564453125, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "entropy": 0.2724430710077286, "epoch": 0.7131410256410257, "frac_reward_zero_std": 0.0, "grad_norm": 317.01702880859375, "learning_rate": 1e-06, "loss": 0.2599, "num_tokens": 443837808.0, "reward": 0.15419574081897736, "reward_std": 0.1597881019115448, "rewards/progression_diversity/mean": -0.0023014158941805363, "rewards/progression_diversity/std": 0.015193020924925804, "rewards/symbolic_reward_accuracy/mean": 0.060546875, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.4554687738418579, "rewards/symbolic_reward_partial_score/std": 0.3217701315879822, "rewards/tag_count_reward/mean": -0.1875, "rewards/tag_count_reward/std": 0.39069411158561707, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0374995470046997, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 482.0, "sampling/sampling_logp_difference/mean": 2.7787814140319824, "step": 445 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.3045268654823303, "epoch": 0.7147435897435898, "grad_norm": 0.030019085854291916, "learning_rate": 1e-06, "loss": 0.178, "step": 446 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.26429812610149384, "epoch": 0.7163461538461539, "grad_norm": 0.038714099675416946, "learning_rate": 1e-06, "loss": 0.3328, "step": 447 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.28812122344970703, "epoch": 0.717948717948718, "grad_norm": 0.03327601030468941, "learning_rate": 1e-06, "loss": 0.2087, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 16384.0, "completions/max_terminated_length": 8403.0, "completions/mean_length": 6429.9140625, "completions/mean_terminated_length": 3111.885498046875, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "entropy": 0.27920031547546387, "epoch": 0.719551282051282, "frac_reward_zero_std": 0.0, "grad_norm": 255.1264190673828, "learning_rate": 1e-06, "loss": 0.2454, "num_tokens": 448087220.0, "reward": 0.14131547510623932, "reward_std": 0.1714935302734375, "rewards/progression_diversity/mean": -0.005660384893417358, "rewards/progression_diversity/std": 0.01929020695388317, "rewards/symbolic_reward_accuracy/mean": 0.052734375, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.43413084745407104, "rewards/symbolic_reward_partial_score/std": 0.3264063894748688, "rewards/tag_count_reward/mean": -0.205078125, "rewards/tag_count_reward/std": 0.4041535556316376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0356756448745728, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 454.0, "sampling/sampling_logp_difference/mean": 3.235456943511963, "step": 449 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.27578917145729065, "epoch": 0.7211538461538461, "grad_norm": 1.0962835550308228, "learning_rate": 1e-06, "loss": 0.2381, "step": 450 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.2769124358892441, "epoch": 0.7227564102564102, "grad_norm": 0.03778436407446861, "learning_rate": 1e-06, "loss": 0.2373, "step": 451 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.25797323882579803, "epoch": 0.7243589743589743, "grad_norm": 0.07790114730596542, "learning_rate": 1e-06, "loss": 0.2823, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13926.0, "completions/mean_length": 6445.306640625, "completions/mean_terminated_length": 3468.74365234375, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "entropy": 0.2812911868095398, "epoch": 0.7259615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 242.18006896972656, "learning_rate": 1e-06, "loss": 0.2006, "num_tokens": 452275297.0, "reward": 0.14035208523273468, "reward_std": 0.1568242758512497, "rewards/progression_diversity/mean": -0.00873718224465847, "rewards/progression_diversity/std": 0.02793550305068493, "rewards/symbolic_reward_accuracy/mean": 0.0546875, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.42190754413604736, "rewards/symbolic_reward_partial_score/std": 0.31549105048179626, "rewards/tag_count_reward/mean": -0.189453125, "rewards/tag_count_reward/std": 0.3922513723373413, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0444124937057495, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 400.0, "sampling/sampling_logp_difference/mean": 3.6778717041015625, "step": 453 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2804671823978424, "epoch": 0.7275641025641025, "grad_norm": 0.040219251066446304, "learning_rate": 1e-06, "loss": 0.1904, "step": 454 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.26130877435207367, "epoch": 0.7291666666666666, "grad_norm": 0.05419059470295906, "learning_rate": 1e-06, "loss": 0.2466, "step": 455 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.6171875, "entropy": 0.2619156539440155, "epoch": 0.7307692307692307, "grad_norm": 6.435583114624023, "learning_rate": 1e-06, "loss": 0.3236, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.205078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7699.0, "completions/mean_length": 6036.04296875, "completions/mean_terminated_length": 3366.422607421875, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "entropy": 0.30352291464805603, "epoch": 0.7323717948717948, "frac_reward_zero_std": 0.0, "grad_norm": 363.8023681640625, "learning_rate": 1e-06, "loss": 0.1275, "num_tokens": 456286823.0, "reward": 0.13116447627544403, "reward_std": 0.14602042734622955, "rewards/progression_diversity/mean": -0.00855403020977974, "rewards/progression_diversity/std": 0.028096795082092285, "rewards/symbolic_reward_accuracy/mean": 0.03515625, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.4192708432674408, "rewards/symbolic_reward_partial_score/std": 0.30005842447280884, "rewards/tag_count_reward/mean": -0.15625, "rewards/tag_count_reward/std": 0.36344730854034424, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0444647073745728, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 430.0, "sampling/sampling_logp_difference/mean": 5.1553144454956055, "step": 457 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2788728326559067, "epoch": 0.7339743589743589, "grad_norm": 0.03529650345444679, "learning_rate": 1e-06, "loss": 0.2712, "step": 458 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2527826279401779, "epoch": 0.7355769230769231, "grad_norm": 0.02542269416153431, "learning_rate": 1e-06, "loss": 0.3004, "step": 459 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.27167366445064545, "epoch": 0.7371794871794872, "grad_norm": 0.02993706800043583, "learning_rate": 1e-06, "loss": 0.2269, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.185546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7170.0, "completions/mean_length": 5713.298828125, "completions/mean_terminated_length": 3282.32373046875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "entropy": 0.2820378243923187, "epoch": 0.7387820512820513, "frac_reward_zero_std": 0.0, "grad_norm": 278.36627197265625, "learning_rate": 1e-06, "loss": 0.1824, "num_tokens": 460071312.0, "reward": 0.17337031662464142, "reward_std": 0.17275133728981018, "rewards/progression_diversity/mean": -0.004277626518160105, "rewards/progression_diversity/std": 0.01868487522006035, "rewards/symbolic_reward_accuracy/mean": 0.072265625, "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, "rewards/symbolic_reward_partial_score/mean": 0.485595703125, "rewards/symbolic_reward_partial_score/std": 0.31720206141471863, "rewards/tag_count_reward/mean": -0.15625, "rewards/tag_count_reward/std": 0.36344730854034424, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0428156852722168, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 456.0, "sampling/sampling_logp_difference/mean": 5.391382217407227, "step": 461 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2974891811609268, "epoch": 0.7403846153846154, "grad_norm": 0.03358296677470207, "learning_rate": 1e-06, "loss": 0.1633, "step": 462 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5546875, "entropy": 0.27032434195280075, "epoch": 0.7419871794871795, "grad_norm": 0.03503183275461197, "learning_rate": 1e-06, "loss": 0.2628, "step": 463 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2741468995809555, "epoch": 0.7435897435897436, "grad_norm": 0.047111768275499344, "learning_rate": 1e-06, "loss": 0.2698, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.224609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15885.0, "completions/mean_length": 6353.427734375, "completions/mean_terminated_length": 3447.84619140625, "completions/min_length": 490.0, "completions/min_terminated_length": 490.0, "entropy": 0.259844645857811, "epoch": 0.7451923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 823.7109985351562, "learning_rate": 1e-06, "loss": 0.2066, "num_tokens": 464320411.0, "reward": 0.13118945062160492, "reward_std": 0.14172431826591492, "rewards/progression_diversity/mean": -0.009473828598856926, "rewards/progression_diversity/std": 0.026718810200691223, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.42329102754592896, "rewards/symbolic_reward_partial_score/std": 0.2852300703525543, "rewards/tag_count_reward/mean": -0.19140625, "rewards/tag_count_reward/std": 0.3937928080558777, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0352518558502197, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 482.0, "sampling/sampling_logp_difference/mean": 8.388681411743164, "step": 465 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.3671875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.609375, "entropy": 0.25056569278240204, "epoch": 0.7467948717948718, "grad_norm": 0.02025044709444046, "learning_rate": 1e-06, "loss": 0.2388, "step": 466 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.25776257365942, "epoch": 0.7483974358974359, "grad_norm": 0.026951203122735023, "learning_rate": 1e-06, "loss": 0.248, "step": 467 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.25720326602458954, "epoch": 0.75, "grad_norm": 0.03187187388539314, "learning_rate": 1e-06, "loss": 0.2192, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.154296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 12745.0, "completions/mean_length": 5212.98828125, "completions/mean_terminated_length": 3174.859130859375, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "entropy": 0.28627626597881317, "epoch": 0.7516025641025641, "frac_reward_zero_std": 0.0, "grad_norm": 330.2890930175781, "learning_rate": 1e-06, "loss": 0.1308, "num_tokens": 467844069.0, "reward": 0.1364753395318985, "reward_std": 0.13440468907356262, "rewards/progression_diversity/mean": -0.009204398840665817, "rewards/progression_diversity/std": 0.028541414067149162, "rewards/symbolic_reward_accuracy/mean": 0.021484375, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.4545735716819763, "rewards/symbolic_reward_partial_score/std": 0.28182342648506165, "rewards/tag_count_reward/mean": -0.126953125, "rewards/tag_count_reward/std": 0.33324605226516724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0359776020050049, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 500.0, "sampling/sampling_logp_difference/mean": 9.518253326416016, "step": 469 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2785252183675766, "epoch": 0.7532051282051282, "grad_norm": 0.04603856801986694, "learning_rate": 1e-06, "loss": 0.226, "step": 470 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.28009602427482605, "epoch": 0.7548076923076923, "grad_norm": 0.03989138454198837, "learning_rate": 1e-06, "loss": 0.1848, "step": 471 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.28221395611763, "epoch": 0.7564102564102564, "grad_norm": 0.04469291865825653, "learning_rate": 1e-06, "loss": 0.1911, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7659.0, "completions/mean_length": 5128.966796875, "completions/mean_terminated_length": 3106.1728515625, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "entropy": 0.26698917895555496, "epoch": 0.7580128205128205, "frac_reward_zero_std": 0.0, "grad_norm": 593.4261474609375, "learning_rate": 1e-06, "loss": 0.2374, "num_tokens": 471326436.0, "reward": 0.17709538340568542, "reward_std": 0.15860500931739807, "rewards/progression_diversity/mean": -0.008724970743060112, "rewards/progression_diversity/std": 0.03066168539226055, "rewards/symbolic_reward_accuracy/mean": 0.07421875, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.48448890447616577, "rewards/symbolic_reward_partial_score/std": 0.2948387861251831, "rewards/tag_count_reward/mean": -0.126953125, "rewards/tag_count_reward/std": 0.33324605226516724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0322108268737793, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 516.0, "sampling/sampling_logp_difference/mean": 10.949844360351562, "step": 473 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.28319013118743896, "epoch": 0.7596153846153846, "grad_norm": 0.02427232451736927, "learning_rate": 1e-06, "loss": 0.196, "step": 474 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.29725903272628784, "epoch": 0.7612179487179487, "grad_norm": 0.03828584402799606, "learning_rate": 1e-06, "loss": 0.1165, "step": 475 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.2925011217594147, "epoch": 0.7628205128205128, "grad_norm": 0.030051294714212418, "learning_rate": 1e-06, "loss": 0.1457, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.173828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6983.0, "completions/mean_length": 5560.11328125, "completions/mean_terminated_length": 3282.7470703125, "completions/min_length": 565.0, "completions/min_terminated_length": 565.0, "entropy": 0.2824954092502594, "epoch": 0.7644230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 1322.20654296875, "learning_rate": 1e-06, "loss": 0.126, "num_tokens": 475077582.0, "reward": 0.1587892770767212, "reward_std": 0.146462082862854, "rewards/progression_diversity/mean": -0.010721936821937561, "rewards/progression_diversity/std": 0.032617803663015366, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.45152994990348816, "rewards/symbolic_reward_partial_score/std": 0.2799621820449829, "rewards/tag_count_reward/mean": -0.140625, "rewards/tag_count_reward/std": 0.3479743003845215, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0238516330718994, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 532.0, "sampling/sampling_logp_difference/mean": 12.856636047363281, "step": 477 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5390625, "entropy": 0.24741245806217194, "epoch": 0.7660256410256411, "grad_norm": 0.03778098151087761, "learning_rate": 1e-06, "loss": 0.2498, "step": 478 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.26974865794181824, "epoch": 0.7676282051282052, "grad_norm": 0.03347004950046539, "learning_rate": 1e-06, "loss": 0.193, "step": 479 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2672978490591049, "epoch": 0.7692307692307693, "grad_norm": 7.450833320617676, "learning_rate": 1e-06, "loss": 0.1819, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15833.0, "completions/mean_length": 5183.740234375, "completions/mean_terminated_length": 3109.6181640625, "completions/min_length": 548.0, "completions/min_terminated_length": 548.0, "entropy": 0.27999909222126007, "epoch": 0.7708333333333334, "frac_reward_zero_std": 0.0, "grad_norm": 328.314208984375, "learning_rate": 1e-06, "loss": 0.1661, "num_tokens": 478652985.0, "reward": 0.16447407007217407, "reward_std": 0.16247986257076263, "rewards/progression_diversity/mean": -0.009626220911741257, "rewards/progression_diversity/std": 0.03150993958115578, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.4671875238418579, "rewards/symbolic_reward_partial_score/std": 0.2948901653289795, "rewards/tag_count_reward/mean": -0.130859375, "rewards/tag_count_reward/std": 0.33757632970809937, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0216107368469238, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 544.0, "sampling/sampling_logp_difference/mean": 14.160472869873047, "step": 481 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.27840910851955414, "epoch": 0.7724358974358975, "grad_norm": 0.05339457094669342, "learning_rate": 1e-06, "loss": 0.1271, "step": 482 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.53125, "entropy": 0.2735966593027115, "epoch": 0.7740384615384616, "grad_norm": 0.03152226284146309, "learning_rate": 1e-06, "loss": 0.2084, "step": 483 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.27197548747062683, "epoch": 0.7756410256410257, "grad_norm": 0.027287054806947708, "learning_rate": 1e-06, "loss": 0.1888, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6238.0, "completions/mean_length": 5183.2421875, "completions/mean_terminated_length": 3109.02783203125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "entropy": 0.2760499268770218, "epoch": 0.7772435897435898, "frac_reward_zero_std": 0.0, "grad_norm": 610.1857299804688, "learning_rate": 1e-06, "loss": 0.1777, "num_tokens": 482058533.0, "reward": 0.16969150304794312, "reward_std": 0.13450489938259125, "rewards/progression_diversity/mean": -0.0122962836176157, "rewards/progression_diversity/std": 0.034512363374233246, "rewards/symbolic_reward_accuracy/mean": 0.06640625, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.47880858182907104, "rewards/symbolic_reward_partial_score/std": 0.29609718918800354, "rewards/tag_count_reward/mean": -0.13671875, "rewards/tag_count_reward/std": 0.3438861668109894, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.017279863357544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 560.0, "sampling/sampling_logp_difference/mean": 15.952376365661621, "step": 485 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2726978659629822, "epoch": 0.7788461538461539, "grad_norm": 0.029111366719007492, "learning_rate": 1e-06, "loss": 0.222, "step": 486 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.29166853427886963, "epoch": 0.780448717948718, "grad_norm": 0.03601072356104851, "learning_rate": 1e-06, "loss": 0.1032, "step": 487 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.546875, "entropy": 0.2872645705938339, "epoch": 0.782051282051282, "grad_norm": 0.02848963811993599, "learning_rate": 1e-06, "loss": 0.1847, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13892.0, "completions/mean_length": 4681.236328125, "completions/mean_terminated_length": 3068.855712890625, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.2787110507488251, "epoch": 0.7836538461538461, "frac_reward_zero_std": 0.0, "grad_norm": 408.6405944824219, "learning_rate": 1e-06, "loss": 0.2056, "num_tokens": 485320734.0, "reward": 0.17335151135921478, "reward_std": 0.15505962073802948, "rewards/progression_diversity/mean": -0.010063882917165756, "rewards/progression_diversity/std": 0.03297411650419235, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.49549156427383423, "rewards/symbolic_reward_partial_score/std": 0.2642830014228821, "rewards/tag_count_reward/mean": -0.103515625, "rewards/tag_count_reward/std": 0.30492907762527466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0186336040496826, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 576.0, "sampling/sampling_logp_difference/mean": 15.60879135131836, "step": 489 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.28752225637435913, "epoch": 0.7852564102564102, "grad_norm": 0.027989018708467484, "learning_rate": 1e-06, "loss": 0.1339, "step": 490 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.28790904581546783, "epoch": 0.7868589743589743, "grad_norm": 0.028247352689504623, "learning_rate": 1e-06, "loss": 0.1596, "step": 491 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.2850726991891861, "epoch": 0.7884615384615384, "grad_norm": 0.030095672234892845, "learning_rate": 1e-06, "loss": 0.1193, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6460.0, "completions/mean_length": 4684.54296875, "completions/mean_terminated_length": 3247.767578125, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "entropy": 0.2784768342971802, "epoch": 0.7900641025641025, "frac_reward_zero_std": 0.0, "grad_norm": 769.5609741210938, "learning_rate": 1e-06, "loss": 0.1766, "num_tokens": 488602388.0, "reward": 0.16056574881076813, "reward_std": 0.1394985318183899, "rewards/progression_diversity/mean": -0.008366829715669155, "rewards/progression_diversity/std": 0.0340796634554863, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.4892740845680237, "rewards/symbolic_reward_partial_score/std": 0.26915544271469116, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0246632099151611, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 592.0, "sampling/sampling_logp_difference/mean": 13.432557106018066, "step": 493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2812363803386688, "epoch": 0.7916666666666666, "grad_norm": 0.02783721312880516, "learning_rate": 1e-06, "loss": 0.1083, "step": 494 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2817336320877075, "epoch": 0.7932692307692307, "grad_norm": 0.027454564347863197, "learning_rate": 1e-06, "loss": 0.1656, "step": 495 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.28926005959510803, "epoch": 0.7948717948717948, "grad_norm": 0.052017685025930405, "learning_rate": 1e-06, "loss": 0.0861, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6732.0, "completions/mean_length": 4107.0703125, "completions/mean_terminated_length": 3038.378173828125, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "entropy": 0.31334738433361053, "epoch": 0.7964743589743589, "frac_reward_zero_std": 0.0, "grad_norm": 333.04046630859375, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 491578936.0, "reward": 0.1752394437789917, "reward_std": 0.12819209694862366, "rewards/progression_diversity/mean": -0.006330978125333786, "rewards/progression_diversity/std": 0.025279799476265907, "rewards/symbolic_reward_accuracy/mean": 0.04296875, "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, "rewards/symbolic_reward_partial_score/mean": 0.5198893547058105, "rewards/symbolic_reward_partial_score/std": 0.25531089305877686, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0289983749389648, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 608.0, "sampling/sampling_logp_difference/mean": 12.241405487060547, "step": 497 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.27990762889385223, "epoch": 0.7980769230769231, "grad_norm": 0.03255309909582138, "learning_rate": 1e-06, "loss": 0.1956, "step": 498 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2794528156518936, "epoch": 0.7996794871794872, "grad_norm": 0.02955903671681881, "learning_rate": 1e-06, "loss": 0.1626, "step": 499 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.28499020636081696, "epoch": 0.8012820512820513, "grad_norm": 0.031167522072792053, "learning_rate": 1e-06, "loss": 0.1355, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7721.0, "completions/mean_length": 4657.025390625, "completions/mean_terminated_length": 3216.87060546875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.2538497596979141, "epoch": 0.8028846153846154, "frac_reward_zero_std": 0.0, "grad_norm": 414.44403076171875, "learning_rate": 1e-06, "loss": 0.1954, "num_tokens": 494832389.0, "reward": 0.20476049184799194, "reward_std": 0.15316171944141388, "rewards/progression_diversity/mean": -0.008816054090857506, "rewards/progression_diversity/std": 0.03022286854684353, "rewards/symbolic_reward_accuracy/mean": 0.107421875, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.49988603591918945, "rewards/symbolic_reward_partial_score/std": 0.2798597812652588, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.017579197883606, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 616.0, "sampling/sampling_logp_difference/mean": 14.574539184570312, "step": 501 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.26197493076324463, "epoch": 0.8044871794871795, "grad_norm": 209.1951141357422, "learning_rate": 1e-06, "loss": 0.1276, "step": 502 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.26250365376472473, "epoch": 0.8060897435897436, "grad_norm": 0.026032600551843643, "learning_rate": 1e-06, "loss": 0.1871, "step": 503 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.28040294349193573, "epoch": 0.8076923076923077, "grad_norm": 0.0345035195350647, "learning_rate": 1e-06, "loss": 0.0733, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6329.0, "completions/mean_length": 4636.939453125, "completions/mean_terminated_length": 3194.318115234375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.2512836679816246, "epoch": 0.8092948717948718, "frac_reward_zero_std": 0.0, "grad_norm": 1516.8017578125, "learning_rate": 1e-06, "loss": 0.2065, "num_tokens": 498112726.0, "reward": 0.14471670985221863, "reward_std": 0.1163174957036972, "rewards/progression_diversity/mean": -0.007822653278708458, "rewards/progression_diversity/std": 0.027946053072810173, "rewards/symbolic_reward_accuracy/mean": 0.01953125, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.4754882752895355, "rewards/symbolic_reward_partial_score/std": 0.25455576181411743, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0157849788665771, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 628.0, "sampling/sampling_logp_difference/mean": 15.629146575927734, "step": 505 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2723834812641144, "epoch": 0.8108974358974359, "grad_norm": 0.023276617750525475, "learning_rate": 1e-06, "loss": 0.0898, "step": 506 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.24540285021066666, "epoch": 0.8125, "grad_norm": 2.523991584777832, "learning_rate": 1e-06, "loss": 0.1915, "step": 507 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.27531081438064575, "epoch": 0.8141025641025641, "grad_norm": 0.03945612907409668, "learning_rate": 1e-06, "loss": 0.0928, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7280.0, "completions/mean_length": 4566.736328125, "completions/mean_terminated_length": 3230.87158203125, "completions/min_length": 496.0, "completions/min_terminated_length": 496.0, "entropy": 0.25356655567884445, "epoch": 0.8157051282051282, "frac_reward_zero_std": 0.0, "grad_norm": 2822.289306640625, "learning_rate": 1e-06, "loss": 0.2041, "num_tokens": 501219247.0, "reward": 0.21896670758724213, "reward_std": 0.16303659975528717, "rewards/progression_diversity/mean": -0.00762702152132988, "rewards/progression_diversity/std": 0.029197612777352333, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.5237630605697632, "rewards/symbolic_reward_partial_score/std": 0.2946349084377289, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.023378610610962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 636.0, "sampling/sampling_logp_difference/mean": 13.583820343017578, "step": 509 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.28210754692554474, "epoch": 0.8173076923076923, "grad_norm": 0.023770952597260475, "learning_rate": 1e-06, "loss": 0.0548, "step": 510 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.26374469697475433, "epoch": 0.8189102564102564, "grad_norm": 0.039423782378435135, "learning_rate": 1e-06, "loss": 0.1377, "step": 511 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2674047648906708, "epoch": 0.8205128205128205, "grad_norm": 0.0316784493625164, "learning_rate": 1e-06, "loss": 0.1403, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6836.0, "completions/mean_length": 5036.466796875, "completions/mean_terminated_length": 3327.9619140625, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "entropy": 0.24934273958206177, "epoch": 0.8221153846153846, "frac_reward_zero_std": 0.0, "grad_norm": 2330.71728515625, "learning_rate": 1e-06, "loss": 0.1066, "num_tokens": 504675182.0, "reward": 0.14850318431854248, "reward_std": 0.10997183620929718, "rewards/progression_diversity/mean": -0.01491626538336277, "rewards/progression_diversity/std": 0.04920973256230354, "rewards/symbolic_reward_accuracy/mean": 0.02734375, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.4779297113418579, "rewards/symbolic_reward_partial_score/std": 0.262747585773468, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.004473090171814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 640.0, "sampling/sampling_logp_difference/mean": 19.370677947998047, "step": 513 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.22383376955986023, "epoch": 0.8237179487179487, "grad_norm": 0.028715649619698524, "learning_rate": 1e-06, "loss": 0.2759, "step": 514 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.23630475997924805, "epoch": 0.8253205128205128, "grad_norm": 0.03143737465143204, "learning_rate": 1e-06, "loss": 0.2012, "step": 515 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.25393127650022507, "epoch": 0.8269230769230769, "grad_norm": 0.031186457723379135, "learning_rate": 1e-06, "loss": 0.0771, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6311.0, "completions/mean_length": 4890.7109375, "completions/mean_terminated_length": 3248.812744140625, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.25201961398124695, "epoch": 0.8285256410256411, "frac_reward_zero_std": 0.0, "grad_norm": 291.3205261230469, "learning_rate": 1e-06, "loss": 0.1009, "num_tokens": 508036634.0, "reward": 0.1587275266647339, "reward_std": 0.1489463448524475, "rewards/progression_diversity/mean": -0.014943530783057213, "rewards/progression_diversity/std": 0.049044571816921234, "rewards/symbolic_reward_accuracy/mean": 0.04296875, "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, "rewards/symbolic_reward_partial_score/mean": 0.47750651836395264, "rewards/symbolic_reward_partial_score/std": 0.25351831316947937, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0078704357147217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 640.0, "sampling/sampling_logp_difference/mean": 17.498640060424805, "step": 517 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.249831885099411, "epoch": 0.8301282051282052, "grad_norm": 133.84860229492188, "learning_rate": 1e-06, "loss": 0.1318, "step": 518 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2366134375333786, "epoch": 0.8317307692307693, "grad_norm": 0.042024604976177216, "learning_rate": 1e-06, "loss": 0.1665, "step": 519 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.23418694734573364, "epoch": 0.8333333333333334, "grad_norm": 0.023398801684379578, "learning_rate": 1e-06, "loss": 0.1972, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 8086.0, "completions/mean_length": 4970.57421875, "completions/mean_terminated_length": 3398.057861328125, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.24243058264255524, "epoch": 0.8349358974358975, "frac_reward_zero_std": 0.0, "grad_norm": 2872.773193359375, "learning_rate": 1e-06, "loss": 0.1181, "num_tokens": 511476240.0, "reward": 0.14858070015907288, "reward_std": 0.11485429108142853, "rewards/progression_diversity/mean": -0.017418432980775833, "rewards/progression_diversity/std": 0.056952327489852905, "rewards/symbolic_reward_accuracy/mean": 0.033203125, "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, "rewards/symbolic_reward_partial_score/mean": 0.4639485478401184, "rewards/symbolic_reward_partial_score/std": 0.2507692277431488, "rewards/tag_count_reward/mean": -0.103515625, "rewards/tag_count_reward/std": 0.30492907762527466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.008687973022461, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 648.0, "sampling/sampling_logp_difference/mean": 16.52926254272461, "step": 521 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.23126275092363358, "epoch": 0.8365384615384616, "grad_norm": 10394.248046875, "learning_rate": 1e-06, "loss": 1.4171, "step": 522 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2367812767624855, "epoch": 0.8381410256410257, "grad_norm": 2017.530517578125, "learning_rate": 1e-06, "loss": 0.5716, "step": 523 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.21209017932415009, "epoch": 0.8397435897435898, "grad_norm": 100393.40625, "learning_rate": 1e-06, "loss": 8.2323, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8267.0, "completions/mean_length": 4735.314453125, "completions/mean_terminated_length": 3418.50634765625, "completions/min_length": 669.0, "completions/min_terminated_length": 669.0, "entropy": 0.24131416529417038, "epoch": 0.8413461538461539, "frac_reward_zero_std": 0.0, "grad_norm": 1308.060302734375, "learning_rate": 1e-06, "loss": 0.0936, "num_tokens": 514695857.0, "reward": 0.18877416849136353, "reward_std": 0.15265323221683502, "rewards/progression_diversity/mean": -0.018579598516225815, "rewards/progression_diversity/std": 0.0655272826552391, "rewards/symbolic_reward_accuracy/mean": 0.078125, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.5009602904319763, "rewards/symbolic_reward_partial_score/std": 0.28437086939811707, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0153002738952637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 656.0, "sampling/sampling_logp_difference/mean": 14.62424087524414, "step": 525 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2314378321170807, "epoch": 0.842948717948718, "grad_norm": 2592.095703125, "learning_rate": 1e-06, "loss": 0.7228, "step": 526 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24521462619304657, "epoch": 0.844551282051282, "grad_norm": 108445.40625, "learning_rate": 1e-06, "loss": 21.7594, "step": 527 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.23320768773555756, "epoch": 0.8461538461538461, "grad_norm": 3017.028076171875, "learning_rate": 1e-06, "loss": 0.684, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6846.0, "completions/mean_length": 5188.595703125, "completions/mean_terminated_length": 3502.99560546875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.23708263784646988, "epoch": 0.8477564102564102, "frac_reward_zero_std": 0.0, "grad_norm": 436.8207092285156, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 518277074.0, "reward": 0.16587716341018677, "reward_std": 0.13942670822143555, "rewards/progression_diversity/mean": -0.018729275092482567, "rewards/progression_diversity/std": 0.06120576709508896, "rewards/symbolic_reward_accuracy/mean": 0.05078125, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.4884440302848816, "rewards/symbolic_reward_partial_score/std": 0.27574726939201355, "rewards/tag_count_reward/mean": -0.109375, "rewards/tag_count_reward/std": 0.31241437792778015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0088396072387695, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 668.0, "sampling/sampling_logp_difference/mean": 15.507166862487793, "step": 529 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2199697643518448, "epoch": 0.8493589743589743, "grad_norm": 0.035212092101573944, "learning_rate": 1e-06, "loss": 0.1937, "step": 530 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.22101637721061707, "epoch": 0.8509615384615384, "grad_norm": 0.045379314571619034, "learning_rate": 1e-06, "loss": 0.1429, "step": 531 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.18882086127996445, "epoch": 0.8525641025641025, "grad_norm": 0.029790926724672318, "learning_rate": 1e-06, "loss": 0.2997, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7388.0, "completions/mean_length": 4781.244140625, "completions/mean_terminated_length": 3327.712158203125, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "entropy": 0.24219012260437012, "epoch": 0.8541666666666666, "frac_reward_zero_std": 0.0, "grad_norm": 1252.3094482421875, "learning_rate": 1e-06, "loss": 0.0937, "num_tokens": 521490175.0, "reward": 0.19383737444877625, "reward_std": 0.18049459159374237, "rewards/progression_diversity/mean": -0.01616620644927025, "rewards/progression_diversity/std": 0.057862453162670135, "rewards/symbolic_reward_accuracy/mean": 0.0859375, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.5060384273529053, "rewards/symbolic_reward_partial_score/std": 0.28731000423431396, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0143693685531616, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 680.0, "sampling/sampling_logp_difference/mean": 14.317965507507324, "step": 533 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.22050420194864273, "epoch": 0.8557692307692307, "grad_norm": 0.0265819001942873, "learning_rate": 1e-06, "loss": 0.17, "step": 534 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.23473266512155533, "epoch": 0.8573717948717948, "grad_norm": 0.02770194783806801, "learning_rate": 1e-06, "loss": 0.1197, "step": 535 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23300082236528397, "epoch": 0.8589743589743589, "grad_norm": 0.03747548907995224, "learning_rate": 1e-06, "loss": 0.1186, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14631.0, "completions/mean_length": 5389.201171875, "completions/mean_terminated_length": 3733.804443359375, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "entropy": 0.20381084084510803, "epoch": 0.8605769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 805.9613037109375, "learning_rate": 1e-06, "loss": 0.1618, "num_tokens": 525175254.0, "reward": 0.14221106469631195, "reward_std": 0.12302105128765106, "rewards/progression_diversity/mean": -0.0181512963026762, "rewards/progression_diversity/std": 0.06077505275607109, "rewards/symbolic_reward_accuracy/mean": 0.029296875, "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, "rewards/symbolic_reward_partial_score/mean": 0.44860023260116577, "rewards/symbolic_reward_partial_score/std": 0.24405843019485474, "rewards/tag_count_reward/mean": -0.09765625, "rewards/tag_count_reward/std": 0.29713961482048035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.005043387413025, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 680.0, "sampling/sampling_logp_difference/mean": 17.193443298339844, "step": 537 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.20633187144994736, "epoch": 0.8621794871794872, "grad_norm": 0.02656022645533085, "learning_rate": 1e-06, "loss": 0.13, "step": 538 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2009679228067398, "epoch": 0.8637820512820513, "grad_norm": 0.0328981950879097, "learning_rate": 1e-06, "loss": 0.2296, "step": 539 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.22686323523521423, "epoch": 0.8653846153846154, "grad_norm": 0.034096311777830124, "learning_rate": 1e-06, "loss": 0.1219, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 8314.0, "completions/mean_length": 5590.28515625, "completions/mean_terminated_length": 3650.40087890625, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "entropy": 0.18773535639047623, "epoch": 0.8669871794871795, "frac_reward_zero_std": 0.0, "grad_norm": 1792.0411376953125, "learning_rate": 1e-06, "loss": 0.3456, "num_tokens": 528906344.0, "reward": 0.12807568907737732, "reward_std": 0.12272882461547852, "rewards/progression_diversity/mean": -0.020069371908903122, "rewards/progression_diversity/std": 0.0620153546333313, "rewards/symbolic_reward_accuracy/mean": 0.017578125, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.4406087398529053, "rewards/symbolic_reward_partial_score/std": 0.2600206136703491, "rewards/tag_count_reward/mean": -0.14453125, "rewards/tag_count_reward/std": 0.35197147727012634, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999688982963562, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 684.0, "sampling/sampling_logp_difference/mean": 18.892946243286133, "step": 541 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2230483591556549, "epoch": 0.8685897435897436, "grad_norm": 0.030025260522961617, "learning_rate": 1e-06, "loss": 0.0836, "step": 542 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.21129625290632248, "epoch": 0.8701923076923077, "grad_norm": 0.029054047539830208, "learning_rate": 1e-06, "loss": 0.1741, "step": 543 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.21284447610378265, "epoch": 0.8717948717948718, "grad_norm": 0.03237487003207207, "learning_rate": 1e-06, "loss": 0.1292, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.146484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7899.0, "completions/mean_length": 5516.55078125, "completions/mean_terminated_length": 3651.427734375, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "entropy": 0.22282154858112335, "epoch": 0.8733974358974359, "frac_reward_zero_std": 0.0, "grad_norm": 214.0397186279297, "learning_rate": 1e-06, "loss": 0.0983, "num_tokens": 532566914.0, "reward": 0.13807618618011475, "reward_std": 0.11777358502149582, "rewards/progression_diversity/mean": -0.013183235190808773, "rewards/progression_diversity/std": 0.04880344495177269, "rewards/symbolic_reward_accuracy/mean": 0.01953125, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.4639485776424408, "rewards/symbolic_reward_partial_score/std": 0.25486207008361816, "rewards/tag_count_reward/mean": -0.126953125, "rewards/tag_count_reward/std": 0.33324605226516724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0068590641021729, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 684.0, "sampling/sampling_logp_difference/mean": 16.14547348022461, "step": 545 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.20825005322694778, "epoch": 0.875, "grad_norm": 0.026716945692896843, "learning_rate": 1e-06, "loss": 0.1561, "step": 546 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.19253433495759964, "epoch": 0.8766025641025641, "grad_norm": 0.5219494104385376, "learning_rate": 1e-06, "loss": 0.2689, "step": 547 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.21415498107671738, "epoch": 0.8782051282051282, "grad_norm": 0.0328250527381897, "learning_rate": 1e-06, "loss": 0.1751, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7527.0, "completions/mean_length": 5125.4140625, "completions/mean_terminated_length": 3574.231201171875, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "entropy": 0.20042593777179718, "epoch": 0.8798076923076923, "frac_reward_zero_std": 0.0, "grad_norm": 668.1113891601562, "learning_rate": 1e-06, "loss": 0.172, "num_tokens": 536149046.0, "reward": 0.15214072167873383, "reward_std": 0.13333392143249512, "rewards/progression_diversity/mean": -0.01493225246667862, "rewards/progression_diversity/std": 0.061123333871364594, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.4685709476470947, "rewards/symbolic_reward_partial_score/std": 0.26284727454185486, "rewards/tag_count_reward/mean": -0.1171875, "rewards/tag_count_reward/std": 0.32195815443992615, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.012796401977539, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 684.0, "sampling/sampling_logp_difference/mean": 12.92884349822998, "step": 549 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.21232055872678757, "epoch": 0.8814102564102564, "grad_norm": 0.02926452085375786, "learning_rate": 1e-06, "loss": 0.1224, "step": 550 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.20979397743940353, "epoch": 0.8830128205128205, "grad_norm": 0.02844247967004776, "learning_rate": 1e-06, "loss": 0.1311, "step": 551 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.1974465250968933, "epoch": 0.8846153846153846, "grad_norm": 0.9733312129974365, "learning_rate": 1e-06, "loss": 0.2059, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13505.0, "completions/mean_length": 5052.701171875, "completions/mean_terminated_length": 3688.978271484375, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.2104191780090332, "epoch": 0.8862179487179487, "frac_reward_zero_std": 0.0, "grad_norm": 485.2064514160156, "learning_rate": 1e-06, "loss": 0.1146, "num_tokens": 539713629.0, "reward": 0.15813249349594116, "reward_std": 0.12568315863609314, "rewards/progression_diversity/mean": -0.006576837040483952, "rewards/progression_diversity/std": 0.032705530524253845, "rewards/symbolic_reward_accuracy/mean": 0.03515625, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.4889160096645355, "rewards/symbolic_reward_partial_score/std": 0.24816220998764038, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0161933898925781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 684.0, "sampling/sampling_logp_difference/mean": 11.277822494506836, "step": 553 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.20011521875858307, "epoch": 0.8878205128205128, "grad_norm": 170.6609344482422, "learning_rate": 1e-06, "loss": 0.1556, "step": 554 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.19730040431022644, "epoch": 0.8894230769230769, "grad_norm": 0.03489186614751816, "learning_rate": 1e-06, "loss": 0.1825, "step": 555 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2150089591741562, "epoch": 0.8910256410256411, "grad_norm": 2.223613977432251, "learning_rate": 1e-06, "loss": 0.1191, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.146484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7381.0, "completions/mean_length": 5520.578125, "completions/mean_terminated_length": 3656.146240234375, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "entropy": 0.21337107568979263, "epoch": 0.8926282051282052, "frac_reward_zero_std": 0.0, "grad_norm": 2374.782470703125, "learning_rate": 1e-06, "loss": 0.0962, "num_tokens": 543413141.0, "reward": 0.1618734747171402, "reward_std": 0.1597273349761963, "rewards/progression_diversity/mean": -0.014801505953073502, "rewards/progression_diversity/std": 0.047020554542541504, "rewards/symbolic_reward_accuracy/mean": 0.06640625, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.44892579317092896, "rewards/symbolic_reward_partial_score/std": 0.27913251519203186, "rewards/tag_count_reward/mean": -0.125, "rewards/tag_count_reward/std": 0.3310423493385315, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986110925674438, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 688.0, "sampling/sampling_logp_difference/mean": 18.847614288330078, "step": 557 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.18713711947202682, "epoch": 0.8942307692307693, "grad_norm": 461.1898193359375, "learning_rate": 1e-06, "loss": 0.229, "step": 558 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.19299649447202682, "epoch": 0.8958333333333334, "grad_norm": 0.025606518611311913, "learning_rate": 1e-06, "loss": 0.2068, "step": 559 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.1916685476899147, "epoch": 0.8974358974358975, "grad_norm": 0.03450322896242142, "learning_rate": 1e-06, "loss": 0.1615, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 8600.0, "completions/mean_length": 5390.927734375, "completions/mean_terminated_length": 3735.791015625, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "entropy": 0.20245271921157837, "epoch": 0.8990384615384616, "frac_reward_zero_std": 0.0, "grad_norm": 967.3348999023438, "learning_rate": 1e-06, "loss": 0.1123, "num_tokens": 547056576.0, "reward": 0.13580447435379028, "reward_std": 0.12063577026128769, "rewards/progression_diversity/mean": -0.0128158088773489, "rewards/progression_diversity/std": 0.04528747498989105, "rewards/symbolic_reward_accuracy/mean": 0.015625, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.4648274779319763, "rewards/symbolic_reward_partial_score/std": 0.2545829117298126, "rewards/tag_count_reward/mean": -0.12890625, "rewards/tag_count_reward/std": 0.33542385697364807, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0074546337127686, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 696.0, "sampling/sampling_logp_difference/mean": 15.149176597595215, "step": 561 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.1998009830713272, "epoch": 0.9006410256410257, "grad_norm": 0.021788692101836205, "learning_rate": 1e-06, "loss": 0.1464, "step": 562 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.1879931315779686, "epoch": 0.9022435897435898, "grad_norm": 0.03801717236638069, "learning_rate": 1e-06, "loss": 0.2225, "step": 563 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.19708418101072311, "epoch": 0.9038461538461539, "grad_norm": 0.028179530054330826, "learning_rate": 1e-06, "loss": 0.199, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.123046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7721.0, "completions/mean_length": 5477.939453125, "completions/mean_terminated_length": 3947.6904296875, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "entropy": 0.1928836703300476, "epoch": 0.905448717948718, "frac_reward_zero_std": 0.0, "grad_norm": 1236.47900390625, "learning_rate": 1e-06, "loss": 0.1333, "num_tokens": 550782689.0, "reward": 0.1491735279560089, "reward_std": 0.13501743972301483, "rewards/progression_diversity/mean": -0.01135750487446785, "rewards/progression_diversity/std": 0.042249299585819244, "rewards/symbolic_reward_accuracy/mean": 0.04296875, "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, "rewards/symbolic_reward_partial_score/mean": 0.4500976502895355, "rewards/symbolic_reward_partial_score/std": 0.2508525252342224, "rewards/tag_count_reward/mean": -0.115234375, "rewards/tag_count_reward/std": 0.3196168541908264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.008751392364502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 696.0, "sampling/sampling_logp_difference/mean": 14.606096267700195, "step": 565 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.19845175743103027, "epoch": 0.907051282051282, "grad_norm": 32.34842300415039, "learning_rate": 1e-06, "loss": 0.1143, "step": 566 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.185062974691391, "epoch": 0.9086538461538461, "grad_norm": 26.105823516845703, "learning_rate": 1e-06, "loss": 0.2236, "step": 567 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.190422885119915, "epoch": 0.9102564102564102, "grad_norm": 0.02777501754462719, "learning_rate": 1e-06, "loss": 0.1594, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 8955.0, "completions/mean_length": 5115.3125, "completions/mean_terminated_length": 3647.646728515625, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "entropy": 0.18562601506710052, "epoch": 0.9118589743589743, "frac_reward_zero_std": 0.0, "grad_norm": 2351.0830078125, "learning_rate": 1e-06, "loss": 0.1649, "num_tokens": 554328513.0, "reward": 0.1948603093624115, "reward_std": 0.1654394268989563, "rewards/progression_diversity/mean": -0.011040431447327137, "rewards/progression_diversity/std": 0.044010140001773834, "rewards/symbolic_reward_accuracy/mean": 0.091796875, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.49755859375, "rewards/symbolic_reward_partial_score/std": 0.2728171944618225, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0104902982711792, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 700.0, "sampling/sampling_logp_difference/mean": 13.494155883789062, "step": 569 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.20058371126651764, "epoch": 0.9134615384615384, "grad_norm": 0.0499381348490715, "learning_rate": 1e-06, "loss": 0.1804, "step": 570 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.19234994798898697, "epoch": 0.9150641025641025, "grad_norm": 0.04124247282743454, "learning_rate": 1e-06, "loss": 0.1146, "step": 571 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.20797397196292877, "epoch": 0.9166666666666666, "grad_norm": 0.028247077018022537, "learning_rate": 1e-06, "loss": 0.0817, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 10550.0, "completions/mean_length": 4636.544921875, "completions/mean_terminated_length": 3694.765625, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "entropy": 0.21711117029190063, "epoch": 0.9182692307692307, "frac_reward_zero_std": 0.0, "grad_norm": 234.86900329589844, "learning_rate": 1e-06, "loss": 0.0689, "num_tokens": 557529192.0, "reward": 0.20868007838726044, "reward_std": 0.16052421927452087, "rewards/progression_diversity/mean": -0.0065048919059336185, "rewards/progression_diversity/std": 0.03668885678052902, "rewards/symbolic_reward_accuracy/mean": 0.1015625, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.5161295533180237, "rewards/symbolic_reward_partial_score/std": 0.28020334243774414, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0254935026168823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 700.0, "sampling/sampling_logp_difference/mean": 8.34628963470459, "step": 573 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.20679805427789688, "epoch": 0.9198717948717948, "grad_norm": 0.06306414306163788, "learning_rate": 1e-06, "loss": 0.1743, "step": 574 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.21248594671487808, "epoch": 0.9214743589743589, "grad_norm": 0.033059295266866684, "learning_rate": 1e-06, "loss": 0.1313, "step": 575 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.21614834666252136, "epoch": 0.9230769230769231, "grad_norm": 0.023236755281686783, "learning_rate": 1e-06, "loss": 0.0916, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7749.0, "completions/mean_length": 5531.244140625, "completions/mean_terminated_length": 3897.233642578125, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "entropy": 0.1956375539302826, "epoch": 0.9246794871794872, "frac_reward_zero_std": 0.0, "grad_norm": 377.7513122558594, "learning_rate": 1e-06, "loss": 0.0834, "num_tokens": 561189509.0, "reward": 0.15623126924037933, "reward_std": 0.13696351647377014, "rewards/progression_diversity/mean": -0.006757441908121109, "rewards/progression_diversity/std": 0.02823774889111519, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.4793294072151184, "rewards/symbolic_reward_partial_score/std": 0.26099953055381775, "rewards/tag_count_reward/mean": -0.109375, "rewards/tag_count_reward/std": 0.31241437792778015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0143134593963623, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 704.0, "sampling/sampling_logp_difference/mean": 11.73034381866455, "step": 577 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.19732258468866348, "epoch": 0.9262820512820513, "grad_norm": 0.030800944194197655, "learning_rate": 1e-06, "loss": 0.1879, "step": 578 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.19447887688875198, "epoch": 0.9278846153846154, "grad_norm": 0.03231598064303398, "learning_rate": 1e-06, "loss": 0.1604, "step": 579 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.19631733745336533, "epoch": 0.9294871794871795, "grad_norm": 0.0303302314132452, "learning_rate": 1e-06, "loss": 0.176, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.119140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10004.0, "completions/mean_length": 5372.26953125, "completions/mean_terminated_length": 3882.8779296875, "completions/min_length": 494.0, "completions/min_terminated_length": 494.0, "entropy": 0.19707387685775757, "epoch": 0.9310897435897436, "frac_reward_zero_std": 0.0, "grad_norm": 645.3209838867188, "learning_rate": 1e-06, "loss": 0.1367, "num_tokens": 564833183.0, "reward": 0.14636225998401642, "reward_std": 0.13317109644412994, "rewards/progression_diversity/mean": -0.0073294322937726974, "rewards/progression_diversity/std": 0.031211169436573982, "rewards/symbolic_reward_accuracy/mean": 0.03515625, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.4542643129825592, "rewards/symbolic_reward_partial_score/std": 0.2500608563423157, "rewards/tag_count_reward/mean": -0.109375, "rewards/tag_count_reward/std": 0.31241437792778015, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0127942562103271, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 704.0, "sampling/sampling_logp_difference/mean": 12.442346572875977, "step": 581 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.1957729086279869, "epoch": 0.9326923076923077, "grad_norm": 0.03353438526391983, "learning_rate": 1e-06, "loss": 0.121, "step": 582 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.19240333884954453, "epoch": 0.9342948717948718, "grad_norm": 6.355886936187744, "learning_rate": 1e-06, "loss": 0.1826, "step": 583 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.1818048357963562, "epoch": 0.9358974358974359, "grad_norm": 0.03699595853686333, "learning_rate": 1e-06, "loss": 0.1854, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11314.0, "completions/mean_length": 5186.060546875, "completions/mean_terminated_length": 3755.486572265625, "completions/min_length": 563.0, "completions/min_terminated_length": 563.0, "entropy": 0.1932847574353218, "epoch": 0.9375, "frac_reward_zero_std": 0.0, "grad_norm": 567.5565185546875, "learning_rate": 1e-06, "loss": 0.1316, "num_tokens": 568426046.0, "reward": 0.18020448088645935, "reward_std": 0.1428624838590622, "rewards/progression_diversity/mean": -0.00933791883289814, "rewards/progression_diversity/std": 0.04019331932067871, "rewards/symbolic_reward_accuracy/mean": 0.07421875, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.4857584536075592, "rewards/symbolic_reward_partial_score/std": 0.26446202397346497, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.010314702987671, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 704.0, "sampling/sampling_logp_difference/mean": 13.445650100708008, "step": 585 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.193020299077034, "epoch": 0.9391025641025641, "grad_norm": 0.04160737618803978, "learning_rate": 1e-06, "loss": 0.1617, "step": 586 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.19402997195720673, "epoch": 0.9407051282051282, "grad_norm": 0.02597195655107498, "learning_rate": 1e-06, "loss": 0.1175, "step": 587 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.1869238093495369, "epoch": 0.9423076923076923, "grad_norm": 0.028201278299093246, "learning_rate": 1e-06, "loss": 0.1579, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8968.0, "completions/mean_length": 5071.556640625, "completions/mean_terminated_length": 3765.32666015625, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "entropy": 0.20189929753541946, "epoch": 0.9439102564102564, "frac_reward_zero_std": 0.0, "grad_norm": 732.089111328125, "learning_rate": 1e-06, "loss": 0.1303, "num_tokens": 571972779.0, "reward": 0.1497948169708252, "reward_std": 0.12462593615055084, "rewards/progression_diversity/mean": -0.00635932432487607, "rewards/progression_diversity/std": 0.03013550490140915, "rewards/symbolic_reward_accuracy/mean": 0.03515625, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.4578613340854645, "rewards/symbolic_reward_partial_score/std": 0.25101757049560547, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.014004111289978, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 708.0, "sampling/sampling_logp_difference/mean": 11.998255729675293, "step": 589 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.19095805287361145, "epoch": 0.9455128205128205, "grad_norm": 124.67987060546875, "learning_rate": 1e-06, "loss": 0.1042, "step": 590 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.19328931719064713, "epoch": 0.9471153846153846, "grad_norm": 0.03191890940070152, "learning_rate": 1e-06, "loss": 0.1363, "step": 591 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.18839430809020996, "epoch": 0.9487179487179487, "grad_norm": 0.5878300666809082, "learning_rate": 1e-06, "loss": 0.1515, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8919.0, "completions/mean_length": 5028.666015625, "completions/mean_terminated_length": 3799.733642578125, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.20535209774971008, "epoch": 0.9503205128205128, "frac_reward_zero_std": 0.0, "grad_norm": 697.3111572265625, "learning_rate": 1e-06, "loss": 0.1668, "num_tokens": 575335136.0, "reward": 0.1823686808347702, "reward_std": 0.14714612066745758, "rewards/progression_diversity/mean": -0.006297005340456963, "rewards/progression_diversity/std": 0.02788672409951687, "rewards/symbolic_reward_accuracy/mean": 0.064453125, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.5111002922058105, "rewards/symbolic_reward_partial_score/std": 0.2809692621231079, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.015028953552246, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 712.0, "sampling/sampling_logp_difference/mean": 12.842995643615723, "step": 593 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2099175900220871, "epoch": 0.9519230769230769, "grad_norm": 0.03140028938651085, "learning_rate": 1e-06, "loss": 0.0722, "step": 594 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.19498909264802933, "epoch": 0.9535256410256411, "grad_norm": 0.029133038595318794, "learning_rate": 1e-06, "loss": 0.1713, "step": 595 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.20219064503908157, "epoch": 0.9551282051282052, "grad_norm": 253.06321716308594, "learning_rate": 1e-06, "loss": 0.1111, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.087890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7837.0, "completions/mean_length": 4973.22265625, "completions/mean_terminated_length": 3873.68310546875, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "entropy": 0.21754413098096848, "epoch": 0.9567307692307693, "frac_reward_zero_std": 0.0, "grad_norm": 279.1087646484375, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 578751282.0, "reward": 0.14907152950763702, "reward_std": 0.11073170602321625, "rewards/progression_diversity/mean": -0.004956918768584728, "rewards/progression_diversity/std": 0.026602579280734062, "rewards/symbolic_reward_accuracy/mean": 0.029296875, "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, "rewards/symbolic_reward_partial_score/mean": 0.4677734375, "rewards/symbolic_reward_partial_score/std": 0.25228434801101685, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.02189040184021, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 712.0, "sampling/sampling_logp_difference/mean": 9.754007339477539, "step": 597 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.20453065633773804, "epoch": 0.9583333333333334, "grad_norm": 189.28167724609375, "learning_rate": 1e-06, "loss": 0.2374, "step": 598 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.20873209089040756, "epoch": 0.9599358974358975, "grad_norm": 47.94603729248047, "learning_rate": 1e-06, "loss": 0.0853, "step": 599 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.19657935947179794, "epoch": 0.9615384615384616, "grad_norm": 0.04143374040722847, "learning_rate": 1e-06, "loss": 0.2157, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13524.0, "completions/mean_length": 5178.005859375, "completions/mean_terminated_length": 3774.17822265625, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.20645452290773392, "epoch": 0.9631410256410257, "frac_reward_zero_std": 0.0, "grad_norm": 354.8719482421875, "learning_rate": 1e-06, "loss": 0.1344, "num_tokens": 582303205.0, "reward": 0.1530710607767105, "reward_std": 0.11708801239728928, "rewards/progression_diversity/mean": -0.0058823819272220135, "rewards/progression_diversity/std": 0.034123148769140244, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.4681152403354645, "rewards/symbolic_reward_partial_score/std": 0.2567155063152313, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.022865891456604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 716.0, "sampling/sampling_logp_difference/mean": 8.755578994750977, "step": 601 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2079230472445488, "epoch": 0.9647435897435898, "grad_norm": 0.13820700347423553, "learning_rate": 1e-06, "loss": 0.154, "step": 602 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2013714388012886, "epoch": 0.9663461538461539, "grad_norm": 0.05277189984917641, "learning_rate": 1e-06, "loss": 0.082, "step": 603 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.20772968977689743, "epoch": 0.967948717948718, "grad_norm": 164.80355834960938, "learning_rate": 1e-06, "loss": 0.1195, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14972.0, "completions/mean_length": 5488.85546875, "completions/mean_terminated_length": 3932.406494140625, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "entropy": 0.18739153444766998, "epoch": 0.969551282051282, "frac_reward_zero_std": 0.0, "grad_norm": 190.9339141845703, "learning_rate": 1e-06, "loss": 0.1891, "num_tokens": 585997131.0, "reward": 0.13834749162197113, "reward_std": 0.10280528664588928, "rewards/progression_diversity/mean": -0.004118788987398148, "rewards/progression_diversity/std": 0.02404225990176201, "rewards/symbolic_reward_accuracy/mean": 0.013671875, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.47366535663604736, "rewards/symbolic_reward_partial_score/std": 0.26022228598594666, "rewards/tag_count_reward/mean": -0.119140625, "rewards/tag_count_reward/std": 0.32427072525024414, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.021909475326538, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 716.0, "sampling/sampling_logp_difference/mean": 8.64652156829834, "step": 605 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.19497393816709518, "epoch": 0.9711538461538461, "grad_norm": 1706.4573974609375, "learning_rate": 1e-06, "loss": 0.2708, "step": 606 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.20221728086471558, "epoch": 0.9727564102564102, "grad_norm": 0.10873478651046753, "learning_rate": 1e-06, "loss": 0.1281, "step": 607 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.20786841213703156, "epoch": 0.9743589743589743, "grad_norm": 0.033346377313137054, "learning_rate": 1e-06, "loss": 0.0572, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6816.0, "completions/mean_length": 5060.9296875, "completions/mean_terminated_length": 3916.447509765625, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "entropy": 0.21419016271829605, "epoch": 0.9759615384615384, "frac_reward_zero_std": 0.0, "grad_norm": 586.444580078125, "learning_rate": 1e-06, "loss": 0.0856, "num_tokens": 589427335.0, "reward": 0.16882535815238953, "reward_std": 0.1255710870027542, "rewards/progression_diversity/mean": -0.005649726837873459, "rewards/progression_diversity/std": 0.029928982257843018, "rewards/symbolic_reward_accuracy/mean": 0.044921875, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.500439465045929, "rewards/symbolic_reward_partial_score/std": 0.2657742500305176, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.023430585861206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 9.106830596923828, "step": 609 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.18879622966051102, "epoch": 0.9775641025641025, "grad_norm": 236.33624267578125, "learning_rate": 1e-06, "loss": 0.2667, "step": 610 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2075936570763588, "epoch": 0.9791666666666666, "grad_norm": 2521.72265625, "learning_rate": 1e-06, "loss": 0.1519, "step": 611 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2186291366815567, "epoch": 0.9807692307692307, "grad_norm": 0.11578943580389023, "learning_rate": 1e-06, "loss": 0.0334, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7141.0, "completions/mean_length": 5257.30859375, "completions/mean_terminated_length": 4053.12109375, "completions/min_length": 1060.0, "completions/min_terminated_length": 1060.0, "entropy": 0.1974191665649414, "epoch": 0.9823717948717948, "frac_reward_zero_std": 0.0, "grad_norm": 265.5615539550781, "learning_rate": 1e-06, "loss": 0.22, "num_tokens": 592999621.0, "reward": 0.16088539361953735, "reward_std": 0.10721283406019211, "rewards/progression_diversity/mean": -0.0037470583338290453, "rewards/progression_diversity/std": 0.02392842248082161, "rewards/symbolic_reward_accuracy/mean": 0.046875, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.47390952706336975, "rewards/symbolic_reward_partial_score/std": 0.23947234451770782, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0285604000091553, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 6.232402801513672, "step": 613 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.21706271171569824, "epoch": 0.9839743589743589, "grad_norm": 286162.34375, "learning_rate": 1e-06, "loss": 93.3376, "step": 614 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2068658247590065, "epoch": 0.9855769230769231, "grad_norm": 8.328322410583496, "learning_rate": 1e-06, "loss": 0.1488, "step": 615 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.21251578629016876, "epoch": 0.9871794871794872, "grad_norm": 284.380615234375, "learning_rate": 1e-06, "loss": 0.1166, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 9352.0, "completions/mean_length": 5079.92578125, "completions/mean_terminated_length": 4017.149658203125, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "entropy": 0.20627272129058838, "epoch": 0.9887820512820513, "frac_reward_zero_std": 0.0, "grad_norm": 218.53688049316406, "learning_rate": 1e-06, "loss": 0.0771, "num_tokens": 596455135.0, "reward": 0.16188350319862366, "reward_std": 0.10645517706871033, "rewards/progression_diversity/mean": -0.0030562267638742924, "rewards/progression_diversity/std": 0.020391173660755157, "rewards/symbolic_reward_accuracy/mean": 0.041015625, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.484375, "rewards/symbolic_reward_partial_score/std": 0.23614925146102905, "rewards/tag_count_reward/mean": -0.080078125, "rewards/tag_count_reward/std": 0.271679550409317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.029923439025879, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 5.815011024475098, "step": 617 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.21643763035535812, "epoch": 0.9903846153846154, "grad_norm": 8340.802734375, "learning_rate": 1e-06, "loss": 0.641, "step": 618 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.21413151174783707, "epoch": 0.9919871794871795, "grad_norm": 0.045055363327264786, "learning_rate": 1e-06, "loss": 0.0645, "step": 619 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.20299791544675827, "epoch": 0.9935897435897436, "grad_norm": 2.9305126667022705, "learning_rate": 1e-06, "loss": 0.1387, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9045.0, "completions/mean_length": 4689.990234375, "completions/mean_terminated_length": 3910.389892578125, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "entropy": 0.2107323333621025, "epoch": 0.9951923076923077, "frac_reward_zero_std": 0.0, "grad_norm": 822.1503295898438, "learning_rate": 1e-06, "loss": 0.1441, "num_tokens": 599732858.0, "reward": 0.1449926495552063, "reward_std": 0.08904429525136948, "rewards/progression_diversity/mean": -0.0031782728619873524, "rewards/progression_diversity/std": 0.03115173615515232, "rewards/symbolic_reward_accuracy/mean": 0.02734375, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.4482584595680237, "rewards/symbolic_reward_partial_score/std": 0.2407679557800293, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0356639623641968, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 3.8055062294006348, "step": 621 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.21765735745429993, "epoch": 0.9967948717948718, "grad_norm": 0.11155123263597488, "learning_rate": 1e-06, "loss": 0.074, "step": 622 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2163386046886444, "epoch": 0.9983974358974359, "grad_norm": 0.035584624856710434, "learning_rate": 1e-06, "loss": 0.0425, "step": 623 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2188253104686737, "epoch": 1.0, "grad_norm": 0.041072484105825424, "learning_rate": 1e-06, "loss": 0.127, "step": 624 }, { "epoch": 1.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.05615234375, "eval_completions/max_length": 16384.0, "eval_completions/max_terminated_length": 6932.09375, "eval_completions/mean_length": 4607.2421875, "eval_completions/mean_terminated_length": 3907.518020629883, "eval_completions/min_length": 1083.65625, "eval_completions/min_terminated_length": 1083.65625, "eval_entropy": 0.21419930551201105, "eval_frac_reward_zero_std": 0.0078125, "eval_loss": 0.040855128318071365, "eval_num_tokens": 599732858.0, "eval_reward": 0.15359255392104387, "eval_reward_std": 0.0873756860382855, "eval_rewards/progression_diversity/mean": -0.002012389669403092, "eval_rewards/progression_diversity/std": 0.016667645196321246, "eval_rewards/symbolic_reward_accuracy/mean": 0.02294921875, "eval_rewards/symbolic_reward_accuracy/std": 0.11482575349509716, "eval_rewards/symbolic_reward_partial_score/mean": 0.4836405459791422, "eval_rewards/symbolic_reward_partial_score/std": 0.2179896729066968, "eval_rewards/tag_count_reward/mean": -0.052490234375, "eval_rewards/tag_count_reward/std": 0.21463490999303758, "eval_runtime": 4410.5978, "eval_samples_per_second": 0.057, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0419876985251904, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 687.4686508178711, "eval_sampling/sampling_logp_difference/mean": 2.2834634024184197, "eval_steps_per_second": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7068.0, "completions/mean_length": 4668.310546875, "completions/mean_terminated_length": 3861.175537109375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.2172800451517105, "epoch": 1.001602564102564, "frac_reward_zero_std": 0.0, "grad_norm": 199.8529510498047, "learning_rate": 1e-06, "loss": 0.0884, "num_tokens": 603060937.0, "reward": 0.1484626829624176, "reward_std": 0.08050423860549927, "rewards/progression_diversity/mean": -0.001877149916253984, "rewards/progression_diversity/std": 0.01999068818986416, "rewards/symbolic_reward_accuracy/mean": 0.01953125, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.4773600399494171, "rewards/symbolic_reward_partial_score/std": 0.22182992100715637, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0379219055175781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 2.98622727394104, "step": 625 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.22022734582424164, "epoch": 1.0032051282051282, "grad_norm": 20.245939254760742, "learning_rate": 1e-06, "loss": 0.0854, "step": 626 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22906509041786194, "epoch": 1.0048076923076923, "grad_norm": 0.04052429273724556, "learning_rate": 1e-06, "loss": 0.0291, "step": 627 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.3359375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.20621829479932785, "epoch": 1.0064102564102564, "grad_norm": 0.6211011409759521, "learning_rate": 1e-06, "loss": 0.1451, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6794.0, "completions/mean_length": 4257.8671875, "completions/mean_terminated_length": 3764.934814453125, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.2293422520160675, "epoch": 1.0080128205128205, "frac_reward_zero_std": 0.0, "grad_norm": 39.52851104736328, "learning_rate": 1e-06, "loss": 0.0808, "num_tokens": 606104485.0, "reward": 0.172392338514328, "reward_std": 0.10835404694080353, "rewards/progression_diversity/mean": -0.001978690503165126, "rewards/progression_diversity/std": 0.029427627101540565, "rewards/symbolic_reward_accuracy/mean": 0.046875, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.4920247495174408, "rewards/symbolic_reward_partial_score/std": 0.22508575022220612, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044852375984192, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 1.5544822216033936, "step": 629 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.23360887169837952, "epoch": 1.0096153846153846, "grad_norm": 0.04223166033625603, "learning_rate": 1e-06, "loss": 0.0465, "step": 630 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2338651716709137, "epoch": 1.0112179487179487, "grad_norm": 5.134727478027344, "learning_rate": 1e-06, "loss": 0.0475, "step": 631 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.22925083339214325, "epoch": 1.0128205128205128, "grad_norm": 0.20785175263881683, "learning_rate": 1e-06, "loss": 0.0922, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 7371.0, "completions/mean_length": 4059.546875, "completions/mean_terminated_length": 3713.076171875, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "entropy": 0.23891044408082962, "epoch": 1.0144230769230769, "frac_reward_zero_std": 0.0, "grad_norm": 379.1642150878906, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 609025469.0, "reward": 0.19592517614364624, "reward_std": 0.10350589454174042, "rewards/progression_diversity/mean": -0.0012319717789068818, "rewards/progression_diversity/std": 0.013662992045283318, "rewards/symbolic_reward_accuracy/mean": 0.078125, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.5053385496139526, "rewards/symbolic_reward_partial_score/std": 0.2347535789012909, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0454130172729492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 1.7672368288040161, "step": 633 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.23257441073656082, "epoch": 1.016025641025641, "grad_norm": 0.036344923079013824, "learning_rate": 1e-06, "loss": 0.0181, "step": 634 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.22513434290885925, "epoch": 1.017628205128205, "grad_norm": 0.2888374924659729, "learning_rate": 1e-06, "loss": 0.1695, "step": 635 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.23959070444107056, "epoch": 1.0192307692307692, "grad_norm": 0.04031003266572952, "learning_rate": 1e-06, "loss": 0.0214, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6660.0, "completions/mean_length": 4129.09375, "completions/mean_terminated_length": 3784.578125, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "entropy": 0.22954071313142776, "epoch": 1.0208333333333333, "frac_reward_zero_std": 0.03125, "grad_norm": 803.85888671875, "learning_rate": 1e-06, "loss": 0.0414, "num_tokens": 611954541.0, "reward": 0.17702984809875488, "reward_std": 0.10909046232700348, "rewards/progression_diversity/mean": -0.0016061984933912754, "rewards/progression_diversity/std": 0.013535745441913605, "rewards/symbolic_reward_accuracy/mean": 0.046875, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.5055176019668579, "rewards/symbolic_reward_partial_score/std": 0.2366805523633957, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.040963053703308, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 4.412889003753662, "step": 637 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23653806746006012, "epoch": 1.0224358974358974, "grad_norm": 0.032999370247125626, "learning_rate": 1e-06, "loss": 0.0386, "step": 638 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.23699220269918442, "epoch": 1.0240384615384615, "grad_norm": 0.044161055237054825, "learning_rate": 1e-06, "loss": 0.0856, "step": 639 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2416703775525093, "epoch": 1.0256410256410255, "grad_norm": 0.04160209372639656, "learning_rate": 1e-06, "loss": -0.0036, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14949.0, "completions/mean_length": 3966.068359375, "completions/mean_terminated_length": 3616.9697265625, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "entropy": 0.2398141846060753, "epoch": 1.0272435897435896, "frac_reward_zero_std": 0.0, "grad_norm": 1.787557601928711, "learning_rate": 1e-06, "loss": 0.0664, "num_tokens": 614793424.0, "reward": 0.1760445237159729, "reward_std": 0.09620662778615952, "rewards/progression_diversity/mean": -0.001016853959299624, "rewards/progression_diversity/std": 0.013255119323730469, "rewards/symbolic_reward_accuracy/mean": 0.04296875, "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, "rewards/symbolic_reward_partial_score/mean": 0.5093749761581421, "rewards/symbolic_reward_partial_score/std": 0.20591585338115692, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0430570840835571, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 3.0360569953918457, "step": 641 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.23307596892118454, "epoch": 1.0288461538461537, "grad_norm": 0.9455596208572388, "learning_rate": 1e-06, "loss": 0.0624, "step": 642 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.23648811876773834, "epoch": 1.0304487179487178, "grad_norm": 0.03208564594388008, "learning_rate": 1e-06, "loss": 0.0013, "step": 643 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.23585931956768036, "epoch": 1.032051282051282, "grad_norm": 0.05858050286769867, "learning_rate": 1e-06, "loss": 0.087, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6913.0, "completions/mean_length": 3554.791015625, "completions/mean_terminated_length": 3194.13037109375, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "entropy": 0.23475763201713562, "epoch": 1.0336538461538463, "frac_reward_zero_std": 0.0, "grad_norm": 2.5429258346557617, "learning_rate": 1e-06, "loss": 0.0621, "num_tokens": 617548181.0, "reward": 0.1708458811044693, "reward_std": 0.0782819613814354, "rewards/progression_diversity/mean": -0.0018388191238045692, "rewards/progression_diversity/std": 0.020668642595410347, "rewards/symbolic_reward_accuracy/mean": 0.02734375, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.5226725339889526, "rewards/symbolic_reward_partial_score/std": 0.20760734379291534, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.043994665145874, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 2.3344595432281494, "step": 645 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.231798954308033, "epoch": 1.0352564102564104, "grad_norm": 0.04138017073273659, "learning_rate": 1e-06, "loss": 0.0803, "step": 646 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.24344021826982498, "epoch": 1.0368589743589745, "grad_norm": 0.03530874475836754, "learning_rate": 1e-06, "loss": 0.0385, "step": 647 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2350650280714035, "epoch": 1.0384615384615385, "grad_norm": 0.035837192088365555, "learning_rate": 1e-06, "loss": 0.0492, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5463.0, "completions/mean_length": 3913.376953125, "completions/mean_terminated_length": 3380.01025390625, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.21108438819646835, "epoch": 1.0400641025641026, "frac_reward_zero_std": 0.03125, "grad_norm": 306.6093444824219, "learning_rate": 1e-06, "loss": 0.151, "num_tokens": 620433494.0, "reward": 0.18476390838623047, "reward_std": 0.11055901646614075, "rewards/progression_diversity/mean": -0.001637741457670927, "rewards/progression_diversity/std": 0.015057054348289967, "rewards/symbolic_reward_accuracy/mean": 0.064453125, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.5006998777389526, "rewards/symbolic_reward_partial_score/std": 0.23833897709846497, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038619875907898, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 3.4865543842315674, "step": 649 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23450932651758194, "epoch": 1.0416666666666667, "grad_norm": 0.03884749859571457, "learning_rate": 1e-06, "loss": 0.0365, "step": 650 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.22569099068641663, "epoch": 1.0432692307692308, "grad_norm": 0.039085451513528824, "learning_rate": 1e-06, "loss": 0.045, "step": 651 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.22521612793207169, "epoch": 1.044871794871795, "grad_norm": 0.03394408896565437, "learning_rate": 1e-06, "loss": 0.0285, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6388.0, "completions/mean_length": 4014.796875, "completions/mean_terminated_length": 3299.222900390625, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.21447355300188065, "epoch": 1.046474358974359, "frac_reward_zero_std": 0.0, "grad_norm": 3.4653165340423584, "learning_rate": 1e-06, "loss": 0.0665, "num_tokens": 623435790.0, "reward": 0.18457384407520294, "reward_std": 0.12713655829429626, "rewards/progression_diversity/mean": -0.00013509648852050304, "rewards/progression_diversity/std": 0.003056884743273258, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.5162923336029053, "rewards/symbolic_reward_partial_score/std": 0.25753483176231384, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0391998291015625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.8635755777359009, "step": 653 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.21595758199691772, "epoch": 1.0480769230769231, "grad_norm": 839.4369506835938, "learning_rate": 1e-06, "loss": 0.1091, "step": 654 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.20685593783855438, "epoch": 1.0496794871794872, "grad_norm": 0.036022186279296875, "learning_rate": 1e-06, "loss": 0.1617, "step": 655 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.21082329750061035, "epoch": 1.0512820512820513, "grad_norm": 0.031025860458612442, "learning_rate": 1e-06, "loss": 0.0654, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 6231.0, "completions/mean_length": 4426.666015625, "completions/mean_terminated_length": 3302.472412109375, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "entropy": 0.20376238226890564, "epoch": 1.0528846153846154, "frac_reward_zero_std": 0.03125, "grad_norm": 223.56224060058594, "learning_rate": 1e-06, "loss": 0.0537, "num_tokens": 626540099.0, "reward": 0.1801775097846985, "reward_std": 0.12979283928871155, "rewards/progression_diversity/mean": -0.00031599291833117604, "rewards/progression_diversity/std": 0.005377059802412987, "rewards/symbolic_reward_accuracy/mean": 0.064453125, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.5003417730331421, "rewards/symbolic_reward_partial_score/std": 0.23511230945587158, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0354280471801758, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 1.9148916006088257, "step": 657 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.20688576996326447, "epoch": 1.0544871794871795, "grad_norm": 141.19863891601562, "learning_rate": 1e-06, "loss": 0.145, "step": 658 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.20776833593845367, "epoch": 1.0560897435897436, "grad_norm": 4.635189056396484, "learning_rate": 1e-06, "loss": 0.1138, "step": 659 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.21761061251163483, "epoch": 1.0576923076923077, "grad_norm": 0.028185199946165085, "learning_rate": 1e-06, "loss": 0.1123, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16107.0, "completions/mean_length": 4155.037109375, "completions/mean_terminated_length": 3312.540771484375, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "entropy": 0.2150147706270218, "epoch": 1.0592948717948718, "frac_reward_zero_std": 0.03125, "grad_norm": 94.42283630371094, "learning_rate": 1e-06, "loss": 0.0893, "num_tokens": 629565094.0, "reward": 0.16469581425189972, "reward_std": 0.10881762206554413, "rewards/progression_diversity/mean": -0.0006340488907881081, "rewards/progression_diversity/std": 0.008337623439729214, "rewards/symbolic_reward_accuracy/mean": 0.029296875, "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, "rewards/symbolic_reward_partial_score/mean": 0.5112468004226685, "rewards/symbolic_reward_partial_score/std": 0.23792609572410583, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0353161096572876, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 2.2100651264190674, "step": 661 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.21428611874580383, "epoch": 1.060897435897436, "grad_norm": 114.22917938232422, "learning_rate": 1e-06, "loss": 0.1123, "step": 662 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2211027517914772, "epoch": 1.0625, "grad_norm": 0.0436987467110157, "learning_rate": 1e-06, "loss": 0.0534, "step": 663 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.18693063408136368, "epoch": 1.064102564102564, "grad_norm": 2.552537679672241, "learning_rate": 1e-06, "loss": 0.2344, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6130.0, "completions/mean_length": 4010.59765625, "completions/mean_terminated_length": 3294.78076171875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.2113461047410965, "epoch": 1.0657051282051282, "frac_reward_zero_std": 0.0, "grad_norm": 246.3715362548828, "learning_rate": 1e-06, "loss": 0.0674, "num_tokens": 632547896.0, "reward": 0.1795777976512909, "reward_std": 0.10678594559431076, "rewards/progression_diversity/mean": -0.00022982530936133116, "rewards/progression_diversity/std": 0.0034447184298187494, "rewards/symbolic_reward_accuracy/mean": 0.052734375, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.5113606452941895, "rewards/symbolic_reward_partial_score/std": 0.2122807502746582, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0385127067565918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 1.2567355632781982, "step": 665 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2206530198454857, "epoch": 1.0673076923076923, "grad_norm": 0.2656633257865906, "learning_rate": 1e-06, "loss": 0.0547, "step": 666 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2168770357966423, "epoch": 1.0689102564102564, "grad_norm": 3.0182042121887207, "learning_rate": 1e-06, "loss": 0.125, "step": 667 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.2043883576989174, "epoch": 1.0705128205128205, "grad_norm": 0.024378696456551552, "learning_rate": 1e-06, "loss": 0.1789, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15758.0, "completions/mean_length": 4331.119140625, "completions/mean_terminated_length": 3446.73583984375, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "entropy": 0.20979925245046616, "epoch": 1.0721153846153846, "frac_reward_zero_std": 0.03125, "grad_norm": 180.8795928955078, "learning_rate": 1e-06, "loss": 0.1064, "num_tokens": 635626965.0, "reward": 0.15303614735603333, "reward_std": 0.08873963356018066, "rewards/progression_diversity/mean": -9.739572124090046e-05, "rewards/progression_diversity/std": 0.0017328658141195774, "rewards/symbolic_reward_accuracy/mean": 0.013671875, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.5055663585662842, "rewards/symbolic_reward_partial_score/std": 0.22250817716121674, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.034943699836731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 2.280566692352295, "step": 669 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.19934061914682388, "epoch": 1.0737179487179487, "grad_norm": 0.06182999163866043, "learning_rate": 1e-06, "loss": 0.1761, "step": 670 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.21370511502027512, "epoch": 1.0753205128205128, "grad_norm": 0.03356883302330971, "learning_rate": 1e-06, "loss": 0.0624, "step": 671 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.20709837973117828, "epoch": 1.0769230769230769, "grad_norm": 7.616127967834473, "learning_rate": 1e-06, "loss": 0.1033, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16086.0, "completions/mean_length": 3967.392578125, "completions/mean_terminated_length": 3488.862060546875, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "entropy": 0.21507354080677032, "epoch": 1.078525641025641, "frac_reward_zero_std": 0.0, "grad_norm": 239.9991912841797, "learning_rate": 1e-06, "loss": 0.0799, "num_tokens": 638507678.0, "reward": 0.22426804900169373, "reward_std": 0.14125576615333557, "rewards/progression_diversity/mean": -0.00044275925029069185, "rewards/progression_diversity/std": 0.008287720382213593, "rewards/symbolic_reward_accuracy/mean": 0.10546875, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.5490071773529053, "rewards/symbolic_reward_partial_score/std": 0.2383054494857788, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0389597415924072, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 1.678175687789917, "step": 673 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.21100656688213348, "epoch": 1.080128205128205, "grad_norm": 2.742002487182617, "learning_rate": 1e-06, "loss": 0.0992, "step": 674 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.21375902742147446, "epoch": 1.0817307692307692, "grad_norm": 0.04870569705963135, "learning_rate": 1e-06, "loss": 0.077, "step": 675 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2123679220676422, "epoch": 1.0833333333333333, "grad_norm": 0.03225383535027504, "learning_rate": 1e-06, "loss": 0.0213, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7501.0, "completions/mean_length": 3955.189453125, "completions/mean_terminated_length": 3528.341552734375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "entropy": 0.19846639037132263, "epoch": 1.0849358974358974, "frac_reward_zero_std": 0.0, "grad_norm": 93.89030456542969, "learning_rate": 1e-06, "loss": 0.1439, "num_tokens": 641467087.0, "reward": 0.18885713815689087, "reward_std": 0.09211981296539307, "rewards/progression_diversity/mean": -2.9897617423557676e-05, "rewards/progression_diversity/std": 0.0006765059079043567, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.5234049558639526, "rewards/symbolic_reward_partial_score/std": 0.22506952285766602, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0395513772964478, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 696.0, "sampling/sampling_logp_difference/mean": 0.8175865411758423, "step": 677 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.21475867927074432, "epoch": 1.0865384615384615, "grad_norm": 0.04221729189157486, "learning_rate": 1e-06, "loss": 0.0553, "step": 678 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2134266495704651, "epoch": 1.0881410256410255, "grad_norm": 0.03336584195494652, "learning_rate": 1e-06, "loss": 0.0467, "step": 679 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.21357693523168564, "epoch": 1.0897435897435896, "grad_norm": 0.08009527623653412, "learning_rate": 1e-06, "loss": 0.0293, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 6588.0, "completions/mean_length": 3841.345703125, "completions/mean_terminated_length": 3410.588134765625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.21339119970798492, "epoch": 1.0913461538461537, "frac_reward_zero_std": 0.0, "grad_norm": 614.2329711914062, "learning_rate": 1e-06, "loss": 0.1081, "num_tokens": 644259776.0, "reward": 0.2062734067440033, "reward_std": 0.1276092380285263, "rewards/progression_diversity/mean": -0.0005905528087168932, "rewards/progression_diversity/std": 0.006680304650217295, "rewards/symbolic_reward_accuracy/mean": 0.076171875, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.5463216304779053, "rewards/symbolic_reward_partial_score/std": 0.24570463597774506, "rewards/tag_count_reward/mean": -0.033203125, "rewards/tag_count_reward/std": 0.17934183776378632, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0360360145568848, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 3.0028653144836426, "step": 681 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.21380823105573654, "epoch": 1.092948717948718, "grad_norm": 0.03602517396211624, "learning_rate": 1e-06, "loss": 0.0344, "step": 682 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.21220668405294418, "epoch": 1.094551282051282, "grad_norm": 0.03208322450518608, "learning_rate": 1e-06, "loss": 0.0598, "step": 683 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2145799621939659, "epoch": 1.0961538461538463, "grad_norm": 0.026013914495706558, "learning_rate": 1e-06, "loss": 0.0583, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6038.0, "completions/mean_length": 3467.587890625, "completions/mean_terminated_length": 3131.088134765625, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.21561385691165924, "epoch": 1.0977564102564104, "frac_reward_zero_std": 0.0, "grad_norm": 72.71430206298828, "learning_rate": 1e-06, "loss": 0.0558, "num_tokens": 647003501.0, "reward": 0.19664551317691803, "reward_std": 0.09150449931621552, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.5467610359191895, "rewards/symbolic_reward_partial_score/std": 0.21742354333400726, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0402486324310303, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 696.0, "sampling/sampling_logp_difference/mean": 0.6293116211891174, "step": 685 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.21327327191829681, "epoch": 1.0993589743589745, "grad_norm": 0.03850381448864937, "learning_rate": 1e-06, "loss": 0.0258, "step": 686 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.21018948405981064, "epoch": 1.1009615384615385, "grad_norm": 696.9727783203125, "learning_rate": 1e-06, "loss": 0.2903, "step": 687 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2201865389943123, "epoch": 1.1025641025641026, "grad_norm": 0.04112207144498825, "learning_rate": 1e-06, "loss": 0.0406, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8348.0, "completions/mean_length": 3814.763671875, "completions/mean_terminated_length": 3487.30859375, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "entropy": 0.21837550401687622, "epoch": 1.1041666666666667, "frac_reward_zero_std": 0.0, "grad_norm": 371.62738037109375, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 649739428.0, "reward": 0.1545269787311554, "reward_std": 0.053041648119688034, "rewards/progression_diversity/mean": -0.00042648223461583257, "rewards/progression_diversity/std": 0.006124202162027359, "rewards/symbolic_reward_accuracy/mean": 0.001953125, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.5196614265441895, "rewards/symbolic_reward_partial_score/std": 0.16844609379768372, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0396302938461304, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 2.3374648094177246, "step": 689 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.21820075809955597, "epoch": 1.1057692307692308, "grad_norm": 0.03433378040790558, "learning_rate": 1e-06, "loss": 0.1054, "step": 690 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.21881910413503647, "epoch": 1.107371794871795, "grad_norm": 0.03191978111863136, "learning_rate": 1e-06, "loss": 0.018, "step": 691 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.22215040028095245, "epoch": 1.108974358974359, "grad_norm": 0.02860073745250702, "learning_rate": 1e-06, "loss": 0.0554, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 7847.0, "completions/mean_length": 3363.330078125, "completions/mean_terminated_length": 3103.954345703125, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "entropy": 0.21577662229537964, "epoch": 1.1105769230769231, "frac_reward_zero_std": 0.0, "grad_norm": 299.27655029296875, "learning_rate": 1e-06, "loss": 0.1036, "num_tokens": 652366973.0, "reward": 0.23104004561901093, "reward_std": 0.10753922909498215, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.099609375, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.5774251222610474, "rewards/symbolic_reward_partial_score/std": 0.2207508236169815, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0428193807601929, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 712.0, "sampling/sampling_logp_difference/mean": 0.8698887228965759, "step": 693 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22989095747470856, "epoch": 1.1121794871794872, "grad_norm": 0.051207710057497025, "learning_rate": 1e-06, "loss": -0.0037, "step": 694 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2215583696961403, "epoch": 1.1137820512820513, "grad_norm": 0.03025507554411888, "learning_rate": 1e-06, "loss": 0.0495, "step": 695 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22304167598485947, "epoch": 1.1153846153846154, "grad_norm": 0.03082253411412239, "learning_rate": 1e-06, "loss": 0.017, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6001.0, "completions/mean_length": 3574.69140625, "completions/mean_terminated_length": 3371.369140625, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "entropy": 0.22506634891033173, "epoch": 1.1169871794871795, "frac_reward_zero_std": 0.0, "grad_norm": 0.03984767943620682, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 655022047.0, "reward": 0.17143532633781433, "reward_std": 0.09424548596143723, "rewards/progression_diversity/mean": -2.249992212455254e-05, "rewards/progression_diversity/std": 0.000509115110617131, "rewards/symbolic_reward_accuracy/mean": 0.037109375, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.50244140625, "rewards/symbolic_reward_partial_score/std": 0.21935398876667023, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0433446168899536, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 1.2426462173461914, "step": 697 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.22180306166410446, "epoch": 1.1185897435897436, "grad_norm": 108.0488052368164, "learning_rate": 1e-06, "loss": 0.0712, "step": 698 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.227100670337677, "epoch": 1.1201923076923077, "grad_norm": 0.034184910356998444, "learning_rate": 1e-06, "loss": 0.039, "step": 699 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2247518002986908, "epoch": 1.1217948717948718, "grad_norm": 0.03340495377779007, "learning_rate": 1e-06, "loss": 0.0312, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 6562.0, "completions/mean_length": 3566.66796875, "completions/mean_terminated_length": 2991.19580078125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.20850242674350739, "epoch": 1.123397435897436, "frac_reward_zero_std": 0.0, "grad_norm": 87.1917495727539, "learning_rate": 1e-06, "loss": 0.1712, "num_tokens": 657830693.0, "reward": 0.1684723198413849, "reward_std": 0.07193771004676819, "rewards/progression_diversity/mean": -0.0009145288495346904, "rewards/progression_diversity/std": 0.013037758879363537, "rewards/symbolic_reward_accuracy/mean": 0.01171875, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.5511881113052368, "rewards/symbolic_reward_partial_score/std": 0.19447360932826996, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0334614515304565, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 4.139370918273926, "step": 701 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.21462332457304, "epoch": 1.125, "grad_norm": 0.03925897926092148, "learning_rate": 1e-06, "loss": 0.1, "step": 702 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2194051519036293, "epoch": 1.126602564102564, "grad_norm": 0.032283466309309006, "learning_rate": 1e-06, "loss": 0.068, "step": 703 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2256844863295555, "epoch": 1.1282051282051282, "grad_norm": 0.027121527120471, "learning_rate": 1e-06, "loss": 0.0355, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 7975.0, "completions/mean_length": 3463.818359375, "completions/mean_terminated_length": 3284.726806640625, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.22059781849384308, "epoch": 1.1298076923076923, "frac_reward_zero_std": 0.03125, "grad_norm": 0.042044855654239655, "learning_rate": 1e-06, "loss": 0.0491, "num_tokens": 660390200.0, "reward": 0.19374506175518036, "reward_std": 0.11770647019147873, "rewards/progression_diversity/mean": -6.273954568314366e-06, "rewards/progression_diversity/std": 0.00014196339179761708, "rewards/symbolic_reward_accuracy/mean": 0.06640625, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.5175618529319763, "rewards/symbolic_reward_partial_score/std": 0.2401990294456482, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451421737670898, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 712.0, "sampling/sampling_logp_difference/mean": 0.5323032736778259, "step": 705 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2260003685951233, "epoch": 1.1314102564102564, "grad_norm": 0.026536772027611732, "learning_rate": 1e-06, "loss": 0.0615, "step": 706 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2393200546503067, "epoch": 1.1330128205128205, "grad_norm": 0.036755435168743134, "learning_rate": 1e-06, "loss": 0.0075, "step": 707 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.23273401707410812, "epoch": 1.1346153846153846, "grad_norm": 0.02972438745200634, "learning_rate": 1e-06, "loss": 0.0061, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5406.0, "completions/mean_length": 3774.552734375, "completions/mean_terminated_length": 3288.590087890625, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "entropy": 0.218715637922287, "epoch": 1.1362179487179487, "frac_reward_zero_std": 0.0, "grad_norm": 1383.7510986328125, "learning_rate": 1e-06, "loss": 0.0569, "num_tokens": 663134643.0, "reward": 0.18339823186397552, "reward_std": 0.08036380261182785, "rewards/progression_diversity/mean": -0.0005094322841614485, "rewards/progression_diversity/std": 0.008040121756494045, "rewards/symbolic_reward_accuracy/mean": 0.041015625, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.5416829586029053, "rewards/symbolic_reward_partial_score/std": 0.21756330132484436, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038155198097229, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 3.4172184467315674, "step": 709 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.23037738353013992, "epoch": 1.1378205128205128, "grad_norm": 0.028344059363007545, "learning_rate": 1e-06, "loss": 0.0167, "step": 710 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.22201494872570038, "epoch": 1.1394230769230769, "grad_norm": 0.04986124113202095, "learning_rate": 1e-06, "loss": 0.0701, "step": 711 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.22669429332017899, "epoch": 1.141025641025641, "grad_norm": 0.03080112673342228, "learning_rate": 1e-06, "loss": 0.0666, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5973.0, "completions/mean_length": 3245.638671875, "completions/mean_terminated_length": 3089.847900390625, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "entropy": 0.23609895259141922, "epoch": 1.142628205128205, "frac_reward_zero_std": 0.0625, "grad_norm": 188.2184295654297, "learning_rate": 1e-06, "loss": 0.0473, "num_tokens": 665642874.0, "reward": 0.21525543928146362, "reward_std": 0.0937977284193039, "rewards/progression_diversity/mean": -0.000335338176228106, "rewards/progression_diversity/std": 0.005407850258052349, "rewards/symbolic_reward_accuracy/mean": 0.08203125, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.557373046875, "rewards/symbolic_reward_partial_score/std": 0.2241562008857727, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0440669059753418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.9209955930709839, "step": 713 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23579012602567673, "epoch": 1.1442307692307692, "grad_norm": 0.031340159475803375, "learning_rate": 1e-06, "loss": -0.0103, "step": 714 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23112714290618896, "epoch": 1.1458333333333333, "grad_norm": 0.03565249219536781, "learning_rate": 1e-06, "loss": 0.021, "step": 715 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2343195155262947, "epoch": 1.1474358974358974, "grad_norm": 0.03568951413035393, "learning_rate": 1e-06, "loss": 0.0219, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5608.0, "completions/mean_length": 3687.26171875, "completions/mean_terminated_length": 3224.627685546875, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "entropy": 0.23093228042125702, "epoch": 1.1490384615384615, "frac_reward_zero_std": 0.03125, "grad_norm": 25.515024185180664, "learning_rate": 1e-06, "loss": 0.008, "num_tokens": 668387632.0, "reward": 0.1549118608236313, "reward_std": 0.06852319091558456, "rewards/progression_diversity/mean": -0.001002308912575245, "rewards/progression_diversity/std": 0.015932733193039894, "rewards/symbolic_reward_accuracy/mean": 0.015625, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.49687501788139343, "rewards/symbolic_reward_partial_score/std": 0.20396628975868225, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041079044342041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 1.6645785570144653, "step": 717 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.22284479439258575, "epoch": 1.1506410256410255, "grad_norm": 0.03507667034864426, "learning_rate": 1e-06, "loss": 0.0831, "step": 718 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2239360809326172, "epoch": 1.1522435897435896, "grad_norm": 0.03187626227736473, "learning_rate": 1e-06, "loss": 0.0759, "step": 719 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2218421921133995, "epoch": 1.1538461538461537, "grad_norm": 0.05089287832379341, "learning_rate": 1e-06, "loss": 0.0547, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5998.0, "completions/mean_length": 3646.466796875, "completions/mean_terminated_length": 3314.627197265625, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "entropy": 0.2157435417175293, "epoch": 1.155448717948718, "frac_reward_zero_std": 0.0, "grad_norm": 491.061767578125, "learning_rate": 1e-06, "loss": 0.0744, "num_tokens": 671153135.0, "reward": 0.16443172097206116, "reward_std": 0.08347620069980621, "rewards/progression_diversity/mean": -0.00018694471509661525, "rewards/progression_diversity/std": 0.00423007644712925, "rewards/symbolic_reward_accuracy/mean": 0.025390625, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.5057942867279053, "rewards/symbolic_reward_partial_score/std": 0.22736532986164093, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0419472455978394, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.7793669700622559, "step": 721 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.23054082691669464, "epoch": 1.157051282051282, "grad_norm": 0.038431961089372635, "learning_rate": 1e-06, "loss": 0.0223, "step": 722 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.22331822663545609, "epoch": 1.1586538461538463, "grad_norm": 0.030653027817606926, "learning_rate": 1e-06, "loss": 0.0586, "step": 723 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.22740525007247925, "epoch": 1.1602564102564104, "grad_norm": 0.03620804473757744, "learning_rate": 1e-06, "loss": 0.0345, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5344.0, "completions/mean_length": 3325.42578125, "completions/mean_terminated_length": 3222.602294921875, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "entropy": 0.2329612523317337, "epoch": 1.1618589743589745, "frac_reward_zero_std": 0.03125, "grad_norm": 0.042414553463459015, "learning_rate": 1e-06, "loss": 0.0642, "num_tokens": 673598329.0, "reward": 0.18591231107711792, "reward_std": 0.08399780839681625, "rewards/progression_diversity/mean": -7.887653919169679e-05, "rewards/progression_diversity/std": 0.0017324578948318958, "rewards/symbolic_reward_accuracy/mean": 0.048828125, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.524658203125, "rewards/symbolic_reward_partial_score/std": 0.23230838775634766, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0474889278411865, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 724.0, "sampling/sampling_logp_difference/mean": 0.47188305854797363, "step": 725 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23176733404397964, "epoch": 1.1634615384615385, "grad_norm": 61.60709762573242, "learning_rate": 1e-06, "loss": 0.0217, "step": 726 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23428219556808472, "epoch": 1.1650641025641026, "grad_norm": 0.038381967693567276, "learning_rate": 1e-06, "loss": 0.0206, "step": 727 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23280168324708939, "epoch": 1.1666666666666667, "grad_norm": 0.02906009368598461, "learning_rate": 1e-06, "loss": -0.0278, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4910.0, "completions/mean_length": 3052.662109375, "completions/mean_terminated_length": 2867.871337890625, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "entropy": 0.2368728667497635, "epoch": 1.1682692307692308, "frac_reward_zero_std": 0.03125, "grad_norm": 0.03455471992492676, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 676083596.0, "reward": 0.19645507633686066, "reward_std": 0.09786352515220642, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.5344075560569763, "rewards/symbolic_reward_partial_score/std": 0.22556960582733154, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0460604429244995, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.4928211271762848, "step": 729 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2318621650338173, "epoch": 1.169871794871795, "grad_norm": 0.04372532665729523, "learning_rate": 1e-06, "loss": -0.0078, "step": 730 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23245908319950104, "epoch": 1.171474358974359, "grad_norm": 0.10342668741941452, "learning_rate": 1e-06, "loss": 0.0665, "step": 731 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2263612002134323, "epoch": 1.1730769230769231, "grad_norm": 36.62001419067383, "learning_rate": 1e-06, "loss": 0.1059, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5687.0, "completions/mean_length": 3134.9296875, "completions/mean_terminated_length": 2951.279296875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "entropy": 0.23260176926851273, "epoch": 1.1746794871794872, "frac_reward_zero_std": 0.03125, "grad_norm": 0.0437169186770916, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 678543112.0, "reward": 0.1655300259590149, "reward_std": 0.08380260318517685, "rewards/progression_diversity/mean": -0.0007075904868543148, "rewards/progression_diversity/std": 0.012582487426698208, "rewards/symbolic_reward_accuracy/mean": 0.03515625, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.48538413643836975, "rewards/symbolic_reward_partial_score/std": 0.22206903994083405, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.042715311050415, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.9027316570281982, "step": 733 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2210148423910141, "epoch": 1.1762820512820513, "grad_norm": 185.5412139892578, "learning_rate": 1e-06, "loss": 0.0454, "step": 734 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.22284194827079773, "epoch": 1.1778846153846154, "grad_norm": 0.0355803407728672, "learning_rate": 1e-06, "loss": 0.0143, "step": 735 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.22447482496500015, "epoch": 1.1794871794871795, "grad_norm": 0.03734675049781799, "learning_rate": 1e-06, "loss": 0.0278, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5229.0, "completions/mean_length": 2980.09765625, "completions/mean_terminated_length": 2847.909423828125, "completions/min_length": 523.0, "completions/min_terminated_length": 523.0, "entropy": 0.22639226913452148, "epoch": 1.1810897435897436, "frac_reward_zero_std": 0.0, "grad_norm": 0.03936491161584854, "learning_rate": 1e-06, "loss": 0.0262, "num_tokens": 681020794.0, "reward": 0.19800294935703278, "reward_std": 0.10915393382310867, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.052734375, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.5577962398529053, "rewards/symbolic_reward_partial_score/std": 0.2130732387304306, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.043833613395691, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.7718756794929504, "step": 737 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.22041287273168564, "epoch": 1.1826923076923077, "grad_norm": 0.025883983820676804, "learning_rate": 1e-06, "loss": -0.0021, "step": 738 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.22687563300132751, "epoch": 1.1842948717948718, "grad_norm": 0.030974598601460457, "learning_rate": 1e-06, "loss": 0.0084, "step": 739 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.22128045558929443, "epoch": 1.185897435897436, "grad_norm": 0.05905633792281151, "learning_rate": 1e-06, "loss": 0.0396, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4864.0, "completions/mean_length": 3153.501953125, "completions/mean_terminated_length": 3023.023681640625, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "entropy": 0.23090147227048874, "epoch": 1.1875, "frac_reward_zero_std": 0.09375, "grad_norm": 331.0104064941406, "learning_rate": 1e-06, "loss": 0.0263, "num_tokens": 683450971.0, "reward": 0.16851550340652466, "reward_std": 0.07939761132001877, "rewards/progression_diversity/mean": -0.000989372143521905, "rewards/progression_diversity/std": 0.013076537288725376, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.48753252625465393, "rewards/symbolic_reward_partial_score/std": 0.20784252882003784, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0429353713989258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 2.1165027618408203, "step": 741 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.22930524498224258, "epoch": 1.189102564102564, "grad_norm": 0.029941659420728683, "learning_rate": 1e-06, "loss": 0.041, "step": 742 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22974687814712524, "epoch": 1.1907051282051282, "grad_norm": 0.025151744484901428, "learning_rate": 1e-06, "loss": 0.0187, "step": 743 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2339945062994957, "epoch": 1.1923076923076923, "grad_norm": 0.02276208996772766, "learning_rate": 1e-06, "loss": 0.0261, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5222.0, "completions/mean_length": 3041.041015625, "completions/mean_terminated_length": 2909.45361328125, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "entropy": 0.23491770029067993, "epoch": 1.1939102564102564, "frac_reward_zero_std": 0.125, "grad_norm": 0.05074154585599899, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 685896720.0, "reward": 0.21035856008529663, "reward_std": 0.08227989077568054, "rewards/progression_diversity/mean": -0.0007646906888112426, "rewards/progression_diversity/std": 0.010030530393123627, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.5794759392738342, "rewards/symbolic_reward_partial_score/std": 0.1977068930864334, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0442752838134766, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 2.015608072280884, "step": 745 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2312600165605545, "epoch": 1.1955128205128205, "grad_norm": 0.03379121422767639, "learning_rate": 1e-06, "loss": 0.0464, "step": 746 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2323080375790596, "epoch": 1.1971153846153846, "grad_norm": 0.02059108205139637, "learning_rate": 1e-06, "loss": 0.0359, "step": 747 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2359512448310852, "epoch": 1.1987179487179487, "grad_norm": 0.026128700003027916, "learning_rate": 1e-06, "loss": -0.0019, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5081.0, "completions/mean_length": 3201.248046875, "completions/mean_terminated_length": 2748.507080078125, "completions/min_length": 579.0, "completions/min_terminated_length": 579.0, "entropy": 0.21788237243890762, "epoch": 1.2003205128205128, "frac_reward_zero_std": 0.03125, "grad_norm": 229.65496826171875, "learning_rate": 1e-06, "loss": 0.0528, "num_tokens": 688491791.0, "reward": 0.17160627245903015, "reward_std": 0.09746118634939194, "rewards/progression_diversity/mean": -0.0005057294620200992, "rewards/progression_diversity/std": 0.005976199172437191, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.5030273795127869, "rewards/symbolic_reward_partial_score/std": 0.2094988375902176, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0365022420883179, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 3.8983547687530518, "step": 749 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.23051826655864716, "epoch": 1.2019230769230769, "grad_norm": 0.03440074622631073, "learning_rate": 1e-06, "loss": 0.0177, "step": 750 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.22294911742210388, "epoch": 1.203525641025641, "grad_norm": 0.028672844171524048, "learning_rate": 1e-06, "loss": 0.0646, "step": 751 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2228538542985916, "epoch": 1.205128205128205, "grad_norm": 36.360984802246094, "learning_rate": 1e-06, "loss": 0.0626, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5348.0, "completions/mean_length": 2809.646484375, "completions/mean_terminated_length": 2729.640625, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "entropy": 0.23874470591545105, "epoch": 1.2067307692307692, "frac_reward_zero_std": 0.0, "grad_norm": 0.05420081317424774, "learning_rate": 1e-06, "loss": 0.0125, "num_tokens": 690740314.0, "reward": 0.1803320348262787, "reward_std": 0.08125782012939453, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.041015625, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.5210286378860474, "rewards/symbolic_reward_partial_score/std": 0.19359782338142395, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0487172603607178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.39224666357040405, "step": 753 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2426346391439438, "epoch": 1.2083333333333333, "grad_norm": 106.4404296875, "learning_rate": 1e-06, "loss": 0.0278, "step": 754 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.23718024045228958, "epoch": 1.2099358974358974, "grad_norm": 0.03360147029161453, "learning_rate": 1e-06, "loss": 0.0182, "step": 755 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.24055323004722595, "epoch": 1.2115384615384615, "grad_norm": 0.028352908790111542, "learning_rate": 1e-06, "loss": -0.0133, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4704.0, "completions/mean_length": 2882.4375, "completions/mean_terminated_length": 2585.99609375, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "entropy": 0.2231486737728119, "epoch": 1.2131410256410255, "frac_reward_zero_std": 0.03125, "grad_norm": 601.402587890625, "learning_rate": 1e-06, "loss": 0.0973, "num_tokens": 693148362.0, "reward": 0.25671058893203735, "reward_std": 0.10707536339759827, "rewards/progression_diversity/mean": -0.000815525883808732, "rewards/progression_diversity/std": 0.010911921970546246, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.5927083492279053, "rewards/symbolic_reward_partial_score/std": 0.24597984552383423, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0398728847503662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 3.051284074783325, "step": 757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.23185677826404572, "epoch": 1.2147435897435896, "grad_norm": 0.03557441011071205, "learning_rate": 1e-06, "loss": 0.0471, "step": 758 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.23132619261741638, "epoch": 1.2163461538461537, "grad_norm": 0.018664143979549408, "learning_rate": 1e-06, "loss": 0.0251, "step": 759 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2319474294781685, "epoch": 1.217948717948718, "grad_norm": 0.017118403688073158, "learning_rate": 1e-06, "loss": 0.0058, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5218.0, "completions/mean_length": 2768.322265625, "completions/mean_terminated_length": 2606.87158203125, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.2300676926970482, "epoch": 1.219551282051282, "frac_reward_zero_std": 0.0, "grad_norm": 52.74298095703125, "learning_rate": 1e-06, "loss": 0.0637, "num_tokens": 695437359.0, "reward": 0.18998649716377258, "reward_std": 0.09169755131006241, "rewards/progression_diversity/mean": -0.00037375889951363206, "rewards/progression_diversity/std": 0.00622538710013032, "rewards/symbolic_reward_accuracy/mean": 0.0546875, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.52783203125, "rewards/symbolic_reward_partial_score/std": 0.2111833095550537, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045241117477417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.6731740236282349, "step": 761 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.23460698127746582, "epoch": 1.2211538461538463, "grad_norm": 1694.8343505859375, "learning_rate": 1e-06, "loss": 0.1124, "step": 762 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.24004460126161575, "epoch": 1.2227564102564104, "grad_norm": 0.038948699831962585, "learning_rate": 1e-06, "loss": 0.0137, "step": 763 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.24492079764604568, "epoch": 1.2243589743589745, "grad_norm": 0.02253109961748123, "learning_rate": 1e-06, "loss": 0.0056, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4993.0, "completions/mean_length": 2854.958984375, "completions/mean_terminated_length": 2748.43115234375, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "entropy": 0.2323237955570221, "epoch": 1.2259615384615385, "frac_reward_zero_std": 0.0, "grad_norm": 1602.8712158203125, "learning_rate": 1e-06, "loss": 0.0338, "num_tokens": 697774042.0, "reward": 0.187491774559021, "reward_std": 0.09734513610601425, "rewards/progression_diversity/mean": -0.00033558663562871516, "rewards/progression_diversity/std": 0.005632203537970781, "rewards/symbolic_reward_accuracy/mean": 0.048828125, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.5299316644668579, "rewards/symbolic_reward_partial_score/std": 0.21817044913768768, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0454214811325073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.2177436351776123, "step": 765 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22870472818613052, "epoch": 1.2275641025641026, "grad_norm": 0.02481686696410179, "learning_rate": 1e-06, "loss": 0.0111, "step": 766 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2385152205824852, "epoch": 1.2291666666666667, "grad_norm": 0.0412432886660099, "learning_rate": 1e-06, "loss": -0.0112, "step": 767 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.23105958104133606, "epoch": 1.2307692307692308, "grad_norm": 0.04013615846633911, "learning_rate": 1e-06, "loss": 0.0401, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5212.0, "completions/mean_length": 3133.908203125, "completions/mean_terminated_length": 2896.828857421875, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.2316986247897148, "epoch": 1.232371794871795, "frac_reward_zero_std": 0.03125, "grad_norm": 384.80572509765625, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 700265723.0, "reward": 0.19464799761772156, "reward_std": 0.07056570053100586, "rewards/progression_diversity/mean": -0.0005334917223080993, "rewards/progression_diversity/std": 0.00818613264709711, "rewards/symbolic_reward_accuracy/mean": 0.044921875, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.5648600459098816, "rewards/symbolic_reward_partial_score/std": 0.20340082049369812, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.042616367340088, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 2.089385509490967, "step": 769 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.23712345212697983, "epoch": 1.233974358974359, "grad_norm": 0.03780589625239372, "learning_rate": 1e-06, "loss": 0.0076, "step": 770 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2268049567937851, "epoch": 1.2355769230769231, "grad_norm": 0.028598081320524216, "learning_rate": 1e-06, "loss": 0.0633, "step": 771 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.22704008221626282, "epoch": 1.2371794871794872, "grad_norm": 0.036604560911655426, "learning_rate": 1e-06, "loss": 0.0556, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5111.0, "completions/mean_length": 2862.83203125, "completions/mean_terminated_length": 2729.4873046875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.23467671126127243, "epoch": 1.2387820512820513, "frac_reward_zero_std": 0.0625, "grad_norm": 248.04339599609375, "learning_rate": 1e-06, "loss": 0.0378, "num_tokens": 702571989.0, "reward": 0.16636018455028534, "reward_std": 0.07103414833545685, "rewards/progression_diversity/mean": -0.00021245863172225654, "rewards/progression_diversity/std": 0.004727643448859453, "rewards/symbolic_reward_accuracy/mean": 0.017578125, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.5232909917831421, "rewards/symbolic_reward_partial_score/std": 0.18974992632865906, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0468013286590576, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.2262119054794312, "step": 773 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2362857237458229, "epoch": 1.2403846153846154, "grad_norm": 0.048055924475193024, "learning_rate": 1e-06, "loss": 0.0191, "step": 774 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.23780542612075806, "epoch": 1.2419871794871795, "grad_norm": 0.02682042494416237, "learning_rate": 1e-06, "loss": -0.0151, "step": 775 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2368232011795044, "epoch": 1.2435897435897436, "grad_norm": 0.09740027785301208, "learning_rate": 1e-06, "loss": 0.026, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5104.0, "completions/mean_length": 2685.009765625, "completions/mean_terminated_length": 2577.143798828125, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "entropy": 0.2424686774611473, "epoch": 1.2451923076923077, "frac_reward_zero_std": 0.03125, "grad_norm": 399.4440612792969, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 704759194.0, "reward": 0.1672852635383606, "reward_std": 0.0681874230504036, "rewards/progression_diversity/mean": -0.0009661708609201014, "rewards/progression_diversity/std": 0.01556091196835041, "rewards/symbolic_reward_accuracy/mean": 0.025390625, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.5088216066360474, "rewards/symbolic_reward_partial_score/std": 0.19267694652080536, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0457208156585693, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 2.5860910415649414, "step": 777 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.24405201524496078, "epoch": 1.2467948717948718, "grad_norm": 0.040985848754644394, "learning_rate": 1e-06, "loss": 0.0082, "step": 778 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.24193061143159866, "epoch": 1.248397435897436, "grad_norm": 0.029141481965780258, "learning_rate": 1e-06, "loss": 0.0323, "step": 779 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.24474172294139862, "epoch": 1.25, "grad_norm": 0.03860503062605858, "learning_rate": 1e-06, "loss": 0.005, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4847.0, "completions/mean_length": 2609.923828125, "completions/mean_terminated_length": 2555.907958984375, "completions/min_length": 534.0, "completions/min_terminated_length": 534.0, "entropy": 0.23170283436775208, "epoch": 1.251602564102564, "frac_reward_zero_std": 0.09375, "grad_norm": 0.0442928783595562, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 707031571.0, "reward": 0.18274357914924622, "reward_std": 0.06679178774356842, "rewards/progression_diversity/mean": -0.0005459238309413195, "rewards/progression_diversity/std": 0.008116460405290127, "rewards/symbolic_reward_accuracy/mean": 0.0234375, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.5635905265808105, "rewards/symbolic_reward_partial_score/std": 0.20824402570724487, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045533299446106, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.4784348011016846, "step": 781 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.23342838883399963, "epoch": 1.2532051282051282, "grad_norm": 0.03521363064646721, "learning_rate": 1e-06, "loss": 0.0218, "step": 782 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2293540984392166, "epoch": 1.2548076923076923, "grad_norm": 0.028306540101766586, "learning_rate": 1e-06, "loss": -0.0201, "step": 783 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.22597584128379822, "epoch": 1.2564102564102564, "grad_norm": 0.024497980251908302, "learning_rate": 1e-06, "loss": 0.0313, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4343.0, "completions/mean_length": 2555.701171875, "completions/mean_terminated_length": 2474.198486328125, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.22865238785743713, "epoch": 1.2580128205128205, "frac_reward_zero_std": 0.0625, "grad_norm": 0.03943171352148056, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 709195418.0, "reward": 0.193578839302063, "reward_std": 0.087828129529953, "rewards/progression_diversity/mean": -2.721707096497994e-05, "rewards/progression_diversity/std": 0.0006158520118333399, "rewards/symbolic_reward_accuracy/mean": 0.048828125, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.549560546875, "rewards/symbolic_reward_partial_score/std": 0.19566665589809418, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045490026473999, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.5368279218673706, "step": 785 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.22690270096063614, "epoch": 1.2596153846153846, "grad_norm": 0.0247288029640913, "learning_rate": 1e-06, "loss": 0.0111, "step": 786 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2224835455417633, "epoch": 1.2612179487179487, "grad_norm": 0.03503105789422989, "learning_rate": 1e-06, "loss": 0.0801, "step": 787 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22508803755044937, "epoch": 1.2628205128205128, "grad_norm": 0.033731210976839066, "learning_rate": 1e-06, "loss": -0.0029, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4441.0, "completions/mean_length": 2418.279296875, "completions/mean_terminated_length": 2280.55029296875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.2210671380162239, "epoch": 1.2644230769230769, "frac_reward_zero_std": 0.0625, "grad_norm": 72.21678924560547, "learning_rate": 1e-06, "loss": 0.0407, "num_tokens": 711362041.0, "reward": 0.17944911122322083, "reward_std": 0.08195337653160095, "rewards/progression_diversity/mean": -0.00040262757102027535, "rewards/progression_diversity/std": 0.006931956857442856, "rewards/symbolic_reward_accuracy/mean": 0.03125, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.5389323234558105, "rewards/symbolic_reward_partial_score/std": 0.18118998408317566, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0432648658752441, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.1448458433151245, "step": 789 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.22485744953155518, "epoch": 1.266025641025641, "grad_norm": 0.02760397456586361, "learning_rate": 1e-06, "loss": 0.021, "step": 790 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.22400467097759247, "epoch": 1.267628205128205, "grad_norm": 0.031739212572574615, "learning_rate": 1e-06, "loss": 0.0194, "step": 791 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2254156917333603, "epoch": 1.2692307692307692, "grad_norm": 0.03261100500822067, "learning_rate": 1e-06, "loss": 0.0151, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4488.0, "completions/mean_length": 2580.109375, "completions/mean_terminated_length": 2388.768310546875, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "entropy": 0.2238694727420807, "epoch": 1.2708333333333333, "frac_reward_zero_std": 0.03125, "grad_norm": 277.4549255371094, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 713579937.0, "reward": 0.18201328814029694, "reward_std": 0.0664268359541893, "rewards/progression_diversity/mean": -0.0013090292923152447, "rewards/progression_diversity/std": 0.013768412172794342, "rewards/symbolic_reward_accuracy/mean": 0.021484375, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.5670410394668579, "rewards/symbolic_reward_partial_score/std": 0.1917123794555664, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0389268398284912, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 2.950878143310547, "step": 793 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2173541858792305, "epoch": 1.2724358974358974, "grad_norm": 745.087890625, "learning_rate": 1e-06, "loss": 0.0668, "step": 794 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.22230473905801773, "epoch": 1.2740384615384617, "grad_norm": 0.03551755100488663, "learning_rate": 1e-06, "loss": 0.0133, "step": 795 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.22595015913248062, "epoch": 1.2756410256410255, "grad_norm": 0.025852041319012642, "learning_rate": 1e-06, "loss": -0.0077, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4958.0, "completions/mean_length": 2487.400390625, "completions/mean_terminated_length": 2405.4951171875, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "entropy": 0.22661074995994568, "epoch": 1.2772435897435899, "frac_reward_zero_std": 0.0625, "grad_norm": 0.04282950237393379, "learning_rate": 1e-06, "loss": -0.0107, "num_tokens": 715731470.0, "reward": 0.24059391021728516, "reward_std": 0.08692439645528793, "rewards/progression_diversity/mean": -0.00017893135373014957, "rewards/progression_diversity/std": 0.0020902305841445923, "rewards/symbolic_reward_accuracy/mean": 0.10546875, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.5930013060569763, "rewards/symbolic_reward_partial_score/std": 0.2335280328989029, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.043853759765625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.3285684883594513, "step": 797 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2201601192355156, "epoch": 1.2788461538461537, "grad_norm": 0.03756594657897949, "learning_rate": 1e-06, "loss": 0.0067, "step": 798 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2165718972682953, "epoch": 1.280448717948718, "grad_norm": 0.02026909776031971, "learning_rate": 1e-06, "loss": 0.0363, "step": 799 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2138141244649887, "epoch": 1.282051282051282, "grad_norm": 0.021396497264504433, "learning_rate": 1e-06, "loss": 0.0612, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5352.0, "completions/mean_length": 2308.837890625, "completions/mean_terminated_length": 2198.009765625, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.2189565673470497, "epoch": 1.2836538461538463, "frac_reward_zero_std": 0.125, "grad_norm": 0.04113735631108284, "learning_rate": 1e-06, "loss": -0.0137, "num_tokens": 717814763.0, "reward": 0.18618622422218323, "reward_std": 0.06758680939674377, "rewards/progression_diversity/mean": -0.0005197992431931198, "rewards/progression_diversity/std": 0.008483768440783024, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.545117199420929, "rewards/symbolic_reward_partial_score/std": 0.2002282440662384, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0427607297897339, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.610470175743103, "step": 801 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.21421056240797043, "epoch": 1.2852564102564101, "grad_norm": 0.02349807135760784, "learning_rate": 1e-06, "loss": 0.0212, "step": 802 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.21766190230846405, "epoch": 1.2868589743589745, "grad_norm": 0.024225391447544098, "learning_rate": 1e-06, "loss": 0.0283, "step": 803 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.22310616821050644, "epoch": 1.2884615384615383, "grad_norm": 0.05020375922322273, "learning_rate": 1e-06, "loss": 0.0196, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4344.0, "completions/max_terminated_length": 4344.0, "completions/mean_length": 2237.517578125, "completions/mean_terminated_length": 2237.517578125, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.22092020511627197, "epoch": 1.2900641025641026, "frac_reward_zero_std": 0.03125, "grad_norm": 0.034722164273262024, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 719703236.0, "reward": 0.24485130608081818, "reward_std": 0.12719884514808655, "rewards/progression_diversity/mean": -0.00022179064399097115, "rewards/progression_diversity/std": 0.003463014727458358, "rewards/symbolic_reward_accuracy/mean": 0.11328125, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.5896158814430237, "rewards/symbolic_reward_partial_score/std": 0.21620358526706696, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0438671112060547, "sampling/importance_sampling_ratio/min": 1.0395459602315473e-15, "sampling/sampling_logp_difference/max": 34.49999237060547, "sampling/sampling_logp_difference/mean": 0.09283532202243805, "step": 805 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.22089938074350357, "epoch": 1.2916666666666667, "grad_norm": 0.027210041880607605, "learning_rate": 1e-06, "loss": -0.0033, "step": 806 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.21874121576547623, "epoch": 1.2932692307692308, "grad_norm": 0.024275539442896843, "learning_rate": 1e-06, "loss": 0.01, "step": 807 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.34375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.21499209105968475, "epoch": 1.294871794871795, "grad_norm": 0.03609732910990715, "learning_rate": 1e-06, "loss": 0.0045, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4430.0, "completions/mean_length": 2284.5625, "completions/mean_terminated_length": 2173.543212890625, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "entropy": 0.21107368171215057, "epoch": 1.296474358974359, "frac_reward_zero_std": 0.03125, "grad_norm": 0.03534679487347603, "learning_rate": 1e-06, "loss": -0.0077, "num_tokens": 721772084.0, "reward": 0.1930440068244934, "reward_std": 0.0763799324631691, "rewards/progression_diversity/mean": -0.0007751868688501418, "rewards/progression_diversity/std": 0.01335845235735178, "rewards/symbolic_reward_accuracy/mean": 0.046875, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.5523600578308105, "rewards/symbolic_reward_partial_score/std": 0.20780852437019348, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.040164828300476, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.9508891105651855, "step": 809 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.21084369719028473, "epoch": 1.2980769230769231, "grad_norm": 0.031098656356334686, "learning_rate": 1e-06, "loss": 0.0288, "step": 810 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2036626636981964, "epoch": 1.2996794871794872, "grad_norm": 0.04239289462566376, "learning_rate": 1e-06, "loss": 0.0488, "step": 811 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.20902617275714874, "epoch": 1.3012820512820513, "grad_norm": 0.03809356689453125, "learning_rate": 1e-06, "loss": 0.0013, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5018.0, "completions/mean_length": 2127.1875, "completions/mean_terminated_length": 1986.5877685546875, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.20260044187307358, "epoch": 1.3028846153846154, "frac_reward_zero_std": 0.03125, "grad_norm": 216.08346557617188, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 723870292.0, "reward": 0.17801672220230103, "reward_std": 0.07631928473711014, "rewards/progression_diversity/mean": -8.632877870695665e-05, "rewards/progression_diversity/std": 0.0017233239486813545, "rewards/symbolic_reward_accuracy/mean": 0.0234375, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.5497721433639526, "rewards/symbolic_reward_partial_score/std": 0.19805040955543518, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038082242012024, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.1360535621643066, "step": 813 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.20947977900505066, "epoch": 1.3044871794871795, "grad_norm": 0.03403943032026291, "learning_rate": 1e-06, "loss": 0.0095, "step": 814 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.19642490148544312, "epoch": 1.3060897435897436, "grad_norm": 0.020816028118133545, "learning_rate": 1e-06, "loss": 0.0554, "step": 815 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.20714019238948822, "epoch": 1.3076923076923077, "grad_norm": 0.02245701290667057, "learning_rate": 1e-06, "loss": -0.0076, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4542.0, "completions/mean_length": 2476.328125, "completions/mean_terminated_length": 2339.171630859375, "completions/min_length": 549.0, "completions/min_terminated_length": 549.0, "entropy": 0.20043767243623734, "epoch": 1.3092948717948718, "frac_reward_zero_std": 0.03125, "grad_norm": 203.6943817138672, "learning_rate": 1e-06, "loss": 0.0633, "num_tokens": 725929244.0, "reward": 0.2166806012392044, "reward_std": 0.10392580181360245, "rewards/progression_diversity/mean": -0.0003975919389631599, "rewards/progression_diversity/std": 0.004248625598847866, "rewards/symbolic_reward_accuracy/mean": 0.087890625, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.549755871295929, "rewards/symbolic_reward_partial_score/std": 0.21964266896247864, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.03891921043396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.3425063192844391, "step": 817 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2018059492111206, "epoch": 1.310897435897436, "grad_norm": 0.02881396934390068, "learning_rate": 1e-06, "loss": 0.0388, "step": 818 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.20520929992198944, "epoch": 1.3125, "grad_norm": 0.029828235507011414, "learning_rate": 1e-06, "loss": 0.0124, "step": 819 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.20675267279148102, "epoch": 1.314102564102564, "grad_norm": 0.02294883131980896, "learning_rate": 1e-06, "loss": -0.0073, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4985.0, "completions/mean_length": 2255.87109375, "completions/mean_terminated_length": 2228.22314453125, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "entropy": 0.20751511305570602, "epoch": 1.3157051282051282, "frac_reward_zero_std": 0.0, "grad_norm": 0.03021460585296154, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 727907322.0, "reward": 0.21369871497154236, "reward_std": 0.09200935065746307, "rewards/progression_diversity/mean": -0.00024685371317900717, "rewards/progression_diversity/std": 0.004451336804777384, "rewards/symbolic_reward_accuracy/mean": 0.078125, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.5567382574081421, "rewards/symbolic_reward_partial_score/std": 0.222740039229393, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.040090799331665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 0.3165525197982788, "step": 821 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.20631252229213715, "epoch": 1.3173076923076923, "grad_norm": 0.021766817197203636, "learning_rate": 1e-06, "loss": -0.0018, "step": 822 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.20735712349414825, "epoch": 1.3189102564102564, "grad_norm": 0.024145064875483513, "learning_rate": 1e-06, "loss": 0.0377, "step": 823 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.20985466986894608, "epoch": 1.3205128205128205, "grad_norm": 0.030014174059033394, "learning_rate": 1e-06, "loss": -0.0084, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4514.0, "completions/mean_length": 2213.6015625, "completions/mean_terminated_length": 2130.08251953125, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.21330777555704117, "epoch": 1.3221153846153846, "frac_reward_zero_std": 0.03125, "grad_norm": 0.032049596309661865, "learning_rate": 1e-06, "loss": 0.0345, "num_tokens": 729936686.0, "reward": 0.18624354898929596, "reward_std": 0.09241214394569397, "rewards/progression_diversity/mean": -0.000645598629489541, "rewards/progression_diversity/std": 0.011169749312102795, "rewards/symbolic_reward_accuracy/mean": 0.0546875, "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, "rewards/symbolic_reward_partial_score/mean": 0.5134114027023315, "rewards/symbolic_reward_partial_score/std": 0.2185731679201126, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0392979383468628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.1306374073028564, "step": 825 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.20698269456624985, "epoch": 1.3237179487179487, "grad_norm": 382.0038757324219, "learning_rate": 1e-06, "loss": 0.04, "step": 826 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2093319445848465, "epoch": 1.3253205128205128, "grad_norm": 0.02666737698018551, "learning_rate": 1e-06, "loss": 0.0096, "step": 827 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2111140564084053, "epoch": 1.3269230769230769, "grad_norm": 172.5316925048828, "learning_rate": 1e-06, "loss": 0.0007, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4570.0, "completions/mean_length": 2048.4921875, "completions/mean_terminated_length": 2020.4383544921875, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.22205299139022827, "epoch": 1.328525641025641, "frac_reward_zero_std": 0.03125, "grad_norm": 0.03405119851231575, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 731851850.0, "reward": 0.20125804841518402, "reward_std": 0.0859595537185669, "rewards/progression_diversity/mean": -0.00017142272554337978, "rewards/progression_diversity/std": 0.0027693838346749544, "rewards/symbolic_reward_accuracy/mean": 0.056640625, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.5582357048988342, "rewards/symbolic_reward_partial_score/std": 0.21341325342655182, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0437791347503662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.45466187596321106, "step": 829 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.22545113414525986, "epoch": 1.330128205128205, "grad_norm": 0.03629428148269653, "learning_rate": 1e-06, "loss": 0.0079, "step": 830 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.22470547258853912, "epoch": 1.3317307692307692, "grad_norm": 0.02873636782169342, "learning_rate": 1e-06, "loss": 0.0038, "step": 831 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2245195508003235, "epoch": 1.3333333333333333, "grad_norm": 0.027126438915729523, "learning_rate": 1e-06, "loss": 0.0426, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8032.0, "completions/mean_length": 1912.681640625, "completions/mean_terminated_length": 1884.362060546875, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.2240256890654564, "epoch": 1.3349358974358974, "frac_reward_zero_std": 0.09375, "grad_norm": 0.03682028129696846, "learning_rate": 1e-06, "loss": 0.0219, "num_tokens": 733749047.0, "reward": 0.20436623692512512, "reward_std": 0.07025286555290222, "rewards/progression_diversity/mean": -0.0003872342931572348, "rewards/progression_diversity/std": 0.005586910527199507, "rewards/symbolic_reward_accuracy/mean": 0.048828125, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.584228515625, "rewards/symbolic_reward_partial_score/std": 0.201664999127388, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0452286005020142, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.28562289476394653, "step": 833 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.22573984414339066, "epoch": 1.3365384615384617, "grad_norm": 0.023644646629691124, "learning_rate": 1e-06, "loss": -0.0002, "step": 834 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2348889708518982, "epoch": 1.3381410256410255, "grad_norm": 0.0235019288957119, "learning_rate": 1e-06, "loss": -0.0061, "step": 835 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2224627062678337, "epoch": 1.3397435897435899, "grad_norm": 0.029762422665953636, "learning_rate": 1e-06, "loss": 0.0313, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4299.0, "completions/max_terminated_length": 4299.0, "completions/mean_length": 1805.880859375, "completions/mean_terminated_length": 1805.880859375, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.23368220031261444, "epoch": 1.3413461538461537, "frac_reward_zero_std": 0.0, "grad_norm": 0.034925542771816254, "learning_rate": 1e-06, "loss": 0.0211, "num_tokens": 735455482.0, "reward": 0.2690373361110687, "reward_std": 0.11118809878826141, "rewards/progression_diversity/mean": -7.453490979969501e-05, "rewards/progression_diversity/std": 0.0010274524101987481, "rewards/symbolic_reward_accuracy/mean": 0.130859375, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.6350748538970947, "rewards/symbolic_reward_partial_score/std": 0.2342989295721054, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046823501586914, "sampling/importance_sampling_ratio/min": 0.0010576416971161962, "sampling/sampling_logp_difference/max": 6.85171365737915, "sampling/sampling_logp_difference/mean": 0.0981544554233551, "step": 837 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.23005392402410507, "epoch": 1.342948717948718, "grad_norm": 0.02568940818309784, "learning_rate": 1e-06, "loss": 0.0137, "step": 838 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.22952641546726227, "epoch": 1.344551282051282, "grad_norm": 0.030425578355789185, "learning_rate": 1e-06, "loss": -0.0307, "step": 839 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.22608061879873276, "epoch": 1.3461538461538463, "grad_norm": 0.03146536648273468, "learning_rate": 1e-06, "loss": 0.042, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4922.0, "completions/mean_length": 1662.537109375, "completions/mean_terminated_length": 1604.8060302734375, "completions/min_length": 464.0, "completions/min_terminated_length": 464.0, "entropy": 0.23019438982009888, "epoch": 1.3477564102564101, "frac_reward_zero_std": 0.0, "grad_norm": 86.43042755126953, "learning_rate": 1e-06, "loss": 0.0126, "num_tokens": 737145805.0, "reward": 0.24045243859291077, "reward_std": 0.12287962436676025, "rewards/progression_diversity/mean": -0.0006555759464390576, "rewards/progression_diversity/std": 0.009774475358426571, "rewards/symbolic_reward_accuracy/mean": 0.099609375, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.6036132574081421, "rewards/symbolic_reward_partial_score/std": 0.22525496780872345, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.043199896812439, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 2.1184022426605225, "step": 841 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2274625524878502, "epoch": 1.3493589743589745, "grad_norm": 0.033875465393066406, "learning_rate": 1e-06, "loss": 0.0174, "step": 842 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2375042364001274, "epoch": 1.3509615384615383, "grad_norm": 0.02767942100763321, "learning_rate": 1e-06, "loss": 0.001, "step": 843 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2300783023238182, "epoch": 1.3525641025641026, "grad_norm": 0.01807348243892193, "learning_rate": 1e-06, "loss": 0.0017, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4712.0, "completions/mean_length": 1617.0, "completions/mean_terminated_length": 1559.09033203125, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.22667521983385086, "epoch": 1.3541666666666667, "frac_reward_zero_std": 0.125, "grad_norm": 0.036902934312820435, "learning_rate": 1e-06, "loss": 0.0133, "num_tokens": 738841725.0, "reward": 0.2651147246360779, "reward_std": 0.10891143232584, "rewards/progression_diversity/mean": -0.00024866661988198757, "rewards/progression_diversity/std": 0.002786832395941019, "rewards/symbolic_reward_accuracy/mean": 0.126953125, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.6311197280883789, "rewards/symbolic_reward_partial_score/std": 0.21445629000663757, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0440601110458374, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.6725925803184509, "step": 845 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.22537444531917572, "epoch": 1.3557692307692308, "grad_norm": 0.02038760855793953, "learning_rate": 1e-06, "loss": 0.0638, "step": 846 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2280193567276001, "epoch": 1.357371794871795, "grad_norm": 0.024952072650194168, "learning_rate": 1e-06, "loss": -0.002, "step": 847 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.23109513521194458, "epoch": 1.358974358974359, "grad_norm": 0.020898301154375076, "learning_rate": 1e-06, "loss": 0.0146, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4054.0, "completions/max_terminated_length": 4054.0, "completions/mean_length": 1489.037109375, "completions/mean_terminated_length": 1489.037109375, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.2277429699897766, "epoch": 1.3605769230769231, "frac_reward_zero_std": 0.0625, "grad_norm": 0.028738927096128464, "learning_rate": 1e-06, "loss": 0.004, "num_tokens": 740499376.0, "reward": 0.2145245224237442, "reward_std": 0.07948393374681473, "rewards/progression_diversity/mean": -0.0001856325543485582, "rewards/progression_diversity/std": 0.002533870516344905, "rewards/symbolic_reward_accuracy/mean": 0.060546875, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.593994140625, "rewards/symbolic_reward_partial_score/std": 0.20968253910541534, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045418381690979, "sampling/importance_sampling_ratio/min": 6.543979334111549e-12, "sampling/sampling_logp_difference/max": 25.75247573852539, "sampling/sampling_logp_difference/mean": 0.09526436775922775, "step": 849 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.22307077050209045, "epoch": 1.3621794871794872, "grad_norm": 0.025526469573378563, "learning_rate": 1e-06, "loss": 0.0063, "step": 850 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.22553226351737976, "epoch": 1.3637820512820513, "grad_norm": 0.02305307425558567, "learning_rate": 1e-06, "loss": 0.0134, "step": 851 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.21545638144016266, "epoch": 1.3653846153846154, "grad_norm": 0.02928796596825123, "learning_rate": 1e-06, "loss": 0.0036, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3814.0, "completions/mean_length": 1567.021484375, "completions/mean_terminated_length": 1538.025390625, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.21113787591457367, "epoch": 1.3669871794871795, "frac_reward_zero_std": 0.03125, "grad_norm": 0.04044374078512192, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 742260011.0, "reward": 0.17563673853874207, "reward_std": 0.04823887348175049, "rewards/progression_diversity/mean": -0.00029063018155284226, "rewards/progression_diversity/std": 0.00633732695132494, "rewards/symbolic_reward_accuracy/mean": 0.009765625, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.5659342408180237, "rewards/symbolic_reward_partial_score/std": 0.18250161409378052, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0408705472946167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.9602516889572144, "step": 853 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.218083955347538, "epoch": 1.3685897435897436, "grad_norm": 0.021193062886595726, "learning_rate": 1e-06, "loss": -0.0058, "step": 854 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.20679837465286255, "epoch": 1.3701923076923077, "grad_norm": 1588.3087158203125, "learning_rate": 1e-06, "loss": 0.0694, "step": 855 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.20584507286548615, "epoch": 1.3717948717948718, "grad_norm": 0.02309674583375454, "learning_rate": 1e-06, "loss": -0.0003, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4411.0, "completions/max_terminated_length": 4411.0, "completions/mean_length": 1414.36328125, "completions/mean_terminated_length": 1414.36328125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.21835298836231232, "epoch": 1.373397435897436, "frac_reward_zero_std": 0.0625, "grad_norm": 0.026670735329389572, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 743893157.0, "reward": 0.23589275777339935, "reward_std": 0.11220350861549377, "rewards/progression_diversity/mean": -8.082183921942487e-05, "rewards/progression_diversity/std": 0.0013526254333555698, "rewards/symbolic_reward_accuracy/mean": 0.1015625, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.5831868648529053, "rewards/symbolic_reward_partial_score/std": 0.22691193222999573, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0431017875671387, "sampling/importance_sampling_ratio/min": 2.427903382340446e-05, "sampling/sampling_logp_difference/max": 10.625897407531738, "sampling/sampling_logp_difference/mean": 0.0924476757645607, "step": 857 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.22078397125005722, "epoch": 1.375, "grad_norm": 0.017304185777902603, "learning_rate": 1e-06, "loss": -0.0035, "step": 858 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.21548201143741608, "epoch": 1.376602564102564, "grad_norm": 0.017954610288143158, "learning_rate": 1e-06, "loss": 0.0101, "step": 859 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.21115773171186447, "epoch": 1.3782051282051282, "grad_norm": 0.02831229567527771, "learning_rate": 1e-06, "loss": -0.0007, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3687.0, "completions/mean_length": 1555.4765625, "completions/mean_terminated_length": 1526.4578857421875, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "entropy": 0.2089003250002861, "epoch": 1.3798076923076923, "frac_reward_zero_std": 0.03125, "grad_norm": 104.46712493896484, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 745612857.0, "reward": 0.18778178095817566, "reward_std": 0.06140269339084625, "rewards/progression_diversity/mean": -0.0006311247125267982, "rewards/progression_diversity/std": 0.005138562526553869, "rewards/symbolic_reward_accuracy/mean": 0.015625, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.5953613519668579, "rewards/symbolic_reward_partial_score/std": 0.1708049327135086, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0414655208587646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 724.0, "sampling/sampling_logp_difference/mean": 0.20548060536384583, "step": 861 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2088572382926941, "epoch": 1.3814102564102564, "grad_norm": 0.025014329701662064, "learning_rate": 1e-06, "loss": 0.0155, "step": 862 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.21202120184898376, "epoch": 1.3830128205128205, "grad_norm": 0.028442082926630974, "learning_rate": 1e-06, "loss": 0.0071, "step": 863 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.21458961814641953, "epoch": 1.3846153846153846, "grad_norm": 0.02302766777575016, "learning_rate": 1e-06, "loss": 0.0094, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4167.0, "completions/mean_length": 1564.603515625, "completions/mean_terminated_length": 1477.2593994140625, "completions/min_length": 416.0, "completions/min_terminated_length": 416.0, "entropy": 0.2116372287273407, "epoch": 1.3862179487179487, "frac_reward_zero_std": 0.03125, "grad_norm": 0.032430339604616165, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 747279262.0, "reward": 0.2634183168411255, "reward_std": 0.09794985502958298, "rewards/progression_diversity/mean": -0.00045274931471794844, "rewards/progression_diversity/std": 0.008904572576284409, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.6300293207168579, "rewards/symbolic_reward_partial_score/std": 0.21683233976364136, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0390100479125977, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.8159689903259277, "step": 865 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.22469650208950043, "epoch": 1.3878205128205128, "grad_norm": 0.018467538058757782, "learning_rate": 1e-06, "loss": 0.0018, "step": 866 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.22149597853422165, "epoch": 1.3894230769230769, "grad_norm": 0.023728886619210243, "learning_rate": 1e-06, "loss": 0.0179, "step": 867 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.22125183045864105, "epoch": 1.391025641025641, "grad_norm": 0.019297420978546143, "learning_rate": 1e-06, "loss": 0.0045, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3547.0, "completions/mean_length": 1396.44921875, "completions/mean_terminated_length": 1367.119384765625, "completions/min_length": 462.0, "completions/min_terminated_length": 462.0, "entropy": 0.23219949007034302, "epoch": 1.392628205128205, "frac_reward_zero_std": 0.125, "grad_norm": 0.035795051604509354, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 748746020.0, "reward": 0.22064433991909027, "reward_std": 0.07329108566045761, "rewards/progression_diversity/mean": -0.0009962395997717977, "rewards/progression_diversity/std": 0.018096894025802612, "rewards/symbolic_reward_accuracy/mean": 0.052734375, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.6306965351104736, "rewards/symbolic_reward_partial_score/std": 0.2060777246952057, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450772047042847, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.2045083045959473, "step": 869 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23559828102588654, "epoch": 1.3942307692307692, "grad_norm": 0.01611843891441822, "learning_rate": 1e-06, "loss": 0.0068, "step": 870 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.23359952867031097, "epoch": 1.3958333333333333, "grad_norm": 0.02076306752860546, "learning_rate": 1e-06, "loss": -0.0021, "step": 871 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2402142956852913, "epoch": 1.3974358974358974, "grad_norm": 0.017290132120251656, "learning_rate": 1e-06, "loss": -0.0002, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3703.0, "completions/mean_length": 1506.962890625, "completions/mean_terminated_length": 1477.8492431640625, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.22817867249250412, "epoch": 1.3990384615384617, "frac_reward_zero_std": 0.125, "grad_norm": 61.729652404785156, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 750412865.0, "reward": 0.23350869119167328, "reward_std": 0.08862544596195221, "rewards/progression_diversity/mean": -0.00020658165158238262, "rewards/progression_diversity/std": 0.0025547959376126528, "rewards/symbolic_reward_accuracy/mean": 0.08984375, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.5993326902389526, "rewards/symbolic_reward_partial_score/std": 0.23785848915576935, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045426368713379, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 712.0, "sampling/sampling_logp_difference/mean": 0.20375153422355652, "step": 873 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.22932185232639313, "epoch": 1.4006410256410255, "grad_norm": 0.02397080697119236, "learning_rate": 1e-06, "loss": 0.0007, "step": 874 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.229917511343956, "epoch": 1.4022435897435899, "grad_norm": 0.028196433559060097, "learning_rate": 1e-06, "loss": 0.0031, "step": 875 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2285906821489334, "epoch": 1.4038461538461537, "grad_norm": 0.021019669249653816, "learning_rate": 1e-06, "loss": -0.0085, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4288.0, "completions/max_terminated_length": 4288.0, "completions/mean_length": 1584.255859375, "completions/mean_terminated_length": 1584.255859375, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.22845402359962463, "epoch": 1.405448717948718, "frac_reward_zero_std": 0.03125, "grad_norm": 0.1292523443698883, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 752032676.0, "reward": 0.21584956347942352, "reward_std": 0.0788746029138565, "rewards/progression_diversity/mean": -0.0004929413553327322, "rewards/progression_diversity/std": 0.0036183472257107496, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.5945149660110474, "rewards/symbolic_reward_partial_score/std": 0.20550201833248138, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0462896823883057, "sampling/importance_sampling_ratio/min": 0.0012942898320034146, "sampling/sampling_logp_difference/max": 6.6497931480407715, "sampling/sampling_logp_difference/mean": 0.09629081189632416, "step": 877 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2279265597462654, "epoch": 1.407051282051282, "grad_norm": 0.019477983936667442, "learning_rate": 1e-06, "loss": -0.0207, "step": 878 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.22508171200752258, "epoch": 1.4086538461538463, "grad_norm": 0.029449041932821274, "learning_rate": 1e-06, "loss": 0.0071, "step": 879 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2288508340716362, "epoch": 1.4102564102564101, "grad_norm": 0.022348174825310707, "learning_rate": 1e-06, "loss": 0.0093, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4524.0, "completions/mean_length": 1608.572265625, "completions/mean_terminated_length": 1579.657470703125, "completions/min_length": 518.0, "completions/min_terminated_length": 518.0, "entropy": 0.22652538120746613, "epoch": 1.4118589743589745, "frac_reward_zero_std": 0.0625, "grad_norm": 0.0312635712325573, "learning_rate": 1e-06, "loss": 0.018, "num_tokens": 753635801.0, "reward": 0.23253975808620453, "reward_std": 0.08958201110363007, "rewards/progression_diversity/mean": -0.0004195784858893603, "rewards/progression_diversity/std": 0.008441566489636898, "rewards/symbolic_reward_accuracy/mean": 0.08984375, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.5961099863052368, "rewards/symbolic_reward_partial_score/std": 0.23455365002155304, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0431697368621826, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.380308985710144, "step": 881 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.22486486285924911, "epoch": 1.4134615384615383, "grad_norm": 0.023399055004119873, "learning_rate": 1e-06, "loss": 0.0023, "step": 882 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.22499487549066544, "epoch": 1.4150641025641026, "grad_norm": 386.1463317871094, "learning_rate": 1e-06, "loss": 0.0152, "step": 883 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.23033517599105835, "epoch": 1.4166666666666667, "grad_norm": 0.02022228017449379, "learning_rate": 1e-06, "loss": 0.0148, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1440.396484375, "completions/mean_terminated_length": 1411.152587890625, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.2310120165348053, "epoch": 1.4182692307692308, "frac_reward_zero_std": 0.09375, "grad_norm": 0.029161928221583366, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 755244068.0, "reward": 0.27608346939086914, "reward_std": 0.10152895748615265, "rewards/progression_diversity/mean": -0.0005383668467402458, "rewards/progression_diversity/std": 0.006143567617982626, "rewards/symbolic_reward_accuracy/mean": 0.1484375, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.624072253704071, "rewards/symbolic_reward_partial_score/std": 0.22695200145244598, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0465407371520996, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 700.0, "sampling/sampling_logp_difference/mean": 0.22680458426475525, "step": 885 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.23101823031902313, "epoch": 1.419871794871795, "grad_norm": 0.01696612685918808, "learning_rate": 1e-06, "loss": 0.008, "step": 886 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.22741740196943283, "epoch": 1.421474358974359, "grad_norm": 0.03519902750849724, "learning_rate": 1e-06, "loss": -0.0015, "step": 887 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.22920458763837814, "epoch": 1.4230769230769231, "grad_norm": 0.01825595274567604, "learning_rate": 1e-06, "loss": -0.0034, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3919.0, "completions/max_terminated_length": 3919.0, "completions/mean_length": 1301.556640625, "completions/mean_terminated_length": 1301.556640625, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.22784828394651413, "epoch": 1.4246794871794872, "frac_reward_zero_std": 0.09375, "grad_norm": 0.03417736664414406, "learning_rate": 1e-06, "loss": 0.0058, "num_tokens": 756822881.0, "reward": 0.27989476919174194, "reward_std": 0.10528245568275452, "rewards/progression_diversity/mean": -0.0002708068350329995, "rewards/progression_diversity/std": 0.0028580420184880495, "rewards/symbolic_reward_accuracy/mean": 0.13671875, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.6595540046691895, "rewards/symbolic_reward_partial_score/std": 0.209646537899971, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0472159385681152, "sampling/importance_sampling_ratio/min": 9.738879924725552e-09, "sampling/sampling_logp_difference/max": 18.447139739990234, "sampling/sampling_logp_difference/mean": 0.09813030064105988, "step": 889 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23206569254398346, "epoch": 1.4262820512820513, "grad_norm": 0.02267414890229702, "learning_rate": 1e-06, "loss": 0.0077, "step": 890 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.23299744725227356, "epoch": 1.4278846153846154, "grad_norm": 0.02226838655769825, "learning_rate": 1e-06, "loss": -0.0129, "step": 891 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23174694180488586, "epoch": 1.4294871794871795, "grad_norm": 0.02325165644288063, "learning_rate": 1e-06, "loss": 0.0171, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3854.0, "completions/max_terminated_length": 3854.0, "completions/mean_length": 1381.82421875, "completions/mean_terminated_length": 1381.82421875, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.23948068916797638, "epoch": 1.4310897435897436, "frac_reward_zero_std": 0.125, "grad_norm": 0.02791544795036316, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 758422935.0, "reward": 0.23517410457134247, "reward_std": 0.05936092138290405, "rewards/progression_diversity/mean": -0.00016881769988685846, "rewards/progression_diversity/std": 0.0023643465247005224, "rewards/symbolic_reward_accuracy/mean": 0.08984375, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.6042318344116211, "rewards/symbolic_reward_partial_score/std": 0.19526702165603638, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0488243103027344, "sampling/importance_sampling_ratio/min": 0.00036482111318036914, "sampling/sampling_logp_difference/max": 7.916103363037109, "sampling/sampling_logp_difference/mean": 0.10077446699142456, "step": 893 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2307603806257248, "epoch": 1.4326923076923077, "grad_norm": 0.029786186292767525, "learning_rate": 1e-06, "loss": 0.0158, "step": 894 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2346210777759552, "epoch": 1.4342948717948718, "grad_norm": 0.030796803534030914, "learning_rate": 1e-06, "loss": -0.0004, "step": 895 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24034950882196426, "epoch": 1.435897435897436, "grad_norm": 0.01626007631421089, "learning_rate": 1e-06, "loss": -0.0021, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4579.0, "completions/max_terminated_length": 4579.0, "completions/mean_length": 1346.220703125, "completions/mean_terminated_length": 1346.220703125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.2430427074432373, "epoch": 1.4375, "frac_reward_zero_std": 0.0625, "grad_norm": 0.030239736661314964, "learning_rate": 1e-06, "loss": 0.0065, "num_tokens": 759944376.0, "reward": 0.2594153881072998, "reward_std": 0.09177584946155548, "rewards/progression_diversity/mean": -0.0003554845170583576, "rewards/progression_diversity/std": 0.004576821345835924, "rewards/symbolic_reward_accuracy/mean": 0.119140625, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.6264485120773315, "rewards/symbolic_reward_partial_score/std": 0.23991042375564575, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0497946739196777, "sampling/importance_sampling_ratio/min": 0.003207254223525524, "sampling/sampling_logp_difference/max": 5.742340087890625, "sampling/sampling_logp_difference/mean": 0.1022864505648613, "step": 897 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.24024366587400436, "epoch": 1.439102564102564, "grad_norm": 0.02961251139640808, "learning_rate": 1e-06, "loss": 0.0027, "step": 898 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.24130822718143463, "epoch": 1.4407051282051282, "grad_norm": 0.02147165685892105, "learning_rate": 1e-06, "loss": 0.0061, "step": 899 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2432633340358734, "epoch": 1.4423076923076923, "grad_norm": 0.027493350207805634, "learning_rate": 1e-06, "loss": 0.0033, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4780.0, "completions/max_terminated_length": 4780.0, "completions/mean_length": 1268.705078125, "completions/mean_terminated_length": 1268.705078125, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "entropy": 0.24526084959506989, "epoch": 1.4439102564102564, "frac_reward_zero_std": 0.03125, "grad_norm": 0.02393464371562004, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 761386065.0, "reward": 0.23188182711601257, "reward_std": 0.0820198506116867, "rewards/progression_diversity/mean": -0.0002951324568130076, "rewards/progression_diversity/std": 0.0035472013987600803, "rewards/symbolic_reward_accuracy/mean": 0.05078125, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.67138671875, "rewards/symbolic_reward_partial_score/std": 0.20371752977371216, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0510923862457275, "sampling/importance_sampling_ratio/min": 4.374550189822912e-05, "sampling/sampling_logp_difference/max": 10.037121772766113, "sampling/sampling_logp_difference/mean": 0.103520967066288, "step": 901 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.24283086508512497, "epoch": 1.4455128205128205, "grad_norm": 0.02904311753809452, "learning_rate": 1e-06, "loss": 0.0, "step": 902 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24352674186229706, "epoch": 1.4471153846153846, "grad_norm": 0.01613425463438034, "learning_rate": 1e-06, "loss": 0.0014, "step": 903 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.25162430852651596, "epoch": 1.4487179487179487, "grad_norm": 0.02424633502960205, "learning_rate": 1e-06, "loss": 0.0034, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4364.0, "completions/max_terminated_length": 4364.0, "completions/mean_length": 1284.42578125, "completions/mean_terminated_length": 1284.42578125, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.24817583709955215, "epoch": 1.4503205128205128, "frac_reward_zero_std": 0.21875, "grad_norm": 0.025694923475384712, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 762867995.0, "reward": 0.30710139870643616, "reward_std": 0.05024458467960358, "rewards/progression_diversity/mean": -0.0003102348418906331, "rewards/progression_diversity/std": 0.003935862332582474, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.656494140625, "rewards/symbolic_reward_partial_score/std": 0.2331247478723526, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0509289503097534, "sampling/importance_sampling_ratio/min": 0.002624225802719593, "sampling/sampling_logp_difference/max": 5.94296932220459, "sampling/sampling_logp_difference/mean": 0.10329769551753998, "step": 905 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.24394653737545013, "epoch": 1.4519230769230769, "grad_norm": 0.020293056964874268, "learning_rate": 1e-06, "loss": -0.0066, "step": 906 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.24355324357748032, "epoch": 1.453525641025641, "grad_norm": 0.01819128543138504, "learning_rate": 1e-06, "loss": 0.0068, "step": 907 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.241835318505764, "epoch": 1.455128205128205, "grad_norm": 0.01816965639591217, "learning_rate": 1e-06, "loss": -0.0012, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3849.0, "completions/max_terminated_length": 3849.0, "completions/mean_length": 1251.197265625, "completions/mean_terminated_length": 1251.197265625, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.24188891798257828, "epoch": 1.4567307692307692, "frac_reward_zero_std": 0.09375, "grad_norm": 0.02820868045091629, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 764313808.0, "reward": 0.31225496530532837, "reward_std": 0.08842569589614868, "rewards/progression_diversity/mean": -9.080560994334519e-05, "rewards/progression_diversity/std": 0.001025109551846981, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.6658528447151184, "rewards/symbolic_reward_partial_score/std": 0.24876438081264496, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0511972904205322, "sampling/importance_sampling_ratio/min": 0.0021847328171133995, "sampling/sampling_logp_difference/max": 6.1262617111206055, "sampling/sampling_logp_difference/mean": 0.1035478338599205, "step": 909 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.24276289343833923, "epoch": 1.4583333333333333, "grad_norm": 0.023914597928524017, "learning_rate": 1e-06, "loss": -0.0036, "step": 910 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2428332194685936, "epoch": 1.4599358974358974, "grad_norm": 0.018325267359614372, "learning_rate": 1e-06, "loss": 0.0122, "step": 911 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.24748840928077698, "epoch": 1.4615384615384617, "grad_norm": 0.021704666316509247, "learning_rate": 1e-06, "loss": -0.0035, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3406.0, "completions/max_terminated_length": 3406.0, "completions/mean_length": 1280.796875, "completions/mean_terminated_length": 1280.796875, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.24230319261550903, "epoch": 1.4631410256410255, "frac_reward_zero_std": 0.15625, "grad_norm": 0.026809457689523697, "learning_rate": 1e-06, "loss": -0.0139, "num_tokens": 765825992.0, "reward": 0.29785943031311035, "reward_std": 0.1095178872346878, "rewards/progression_diversity/mean": -0.00018848836771212518, "rewards/progression_diversity/std": 0.0023818761110305786, "rewards/symbolic_reward_accuracy/mean": 0.169921875, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.653027355670929, "rewards/symbolic_reward_partial_score/std": 0.24102628231048584, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0500684976577759, "sampling/importance_sampling_ratio/min": 0.00195193977560848, "sampling/sampling_logp_difference/max": 6.238931655883789, "sampling/sampling_logp_difference/mean": 0.10224372148513794, "step": 913 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.24049583077430725, "epoch": 1.4647435897435899, "grad_norm": 0.019987255334854126, "learning_rate": 1e-06, "loss": 0.0159, "step": 914 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.23697714507579803, "epoch": 1.4663461538461537, "grad_norm": 0.014981388114392757, "learning_rate": 1e-06, "loss": 0.0016, "step": 915 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.23524734377861023, "epoch": 1.467948717948718, "grad_norm": 0.019518760964274406, "learning_rate": 1e-06, "loss": 0.0036, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3382.0, "completions/max_terminated_length": 3382.0, "completions/mean_length": 1324.177734375, "completions/mean_terminated_length": 1324.177734375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.23860549181699753, "epoch": 1.469551282051282, "frac_reward_zero_std": 0.09375, "grad_norm": 0.034368306398391724, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 767335715.0, "reward": 0.2800864577293396, "reward_std": 0.11742740869522095, "rewards/progression_diversity/mean": -0.0001446699898224324, "rewards/progression_diversity/std": 0.0014527636812999845, "rewards/symbolic_reward_accuracy/mean": 0.146484375, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.6406575441360474, "rewards/symbolic_reward_partial_score/std": 0.218194380402565, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0496015548706055, "sampling/importance_sampling_ratio/min": 2.004770264726896e-12, "sampling/sampling_logp_difference/max": 26.93549156188965, "sampling/sampling_logp_difference/mean": 0.10103441774845123, "step": 917 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2368185818195343, "epoch": 1.4711538461538463, "grad_norm": 0.018526000902056694, "learning_rate": 1e-06, "loss": -0.0033, "step": 918 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.24254272133111954, "epoch": 1.4727564102564101, "grad_norm": 0.021521111950278282, "learning_rate": 1e-06, "loss": 0.0077, "step": 919 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24013221263885498, "epoch": 1.4743589743589745, "grad_norm": 0.019323555752635002, "learning_rate": 1e-06, "loss": 0.0033, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3175.0, "completions/max_terminated_length": 3175.0, "completions/mean_length": 1178.45703125, "completions/mean_terminated_length": 1178.45703125, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.24065189808607101, "epoch": 1.4759615384615383, "frac_reward_zero_std": 0.09375, "grad_norm": 0.027268243953585625, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 768849229.0, "reward": 0.2435724139213562, "reward_std": 0.0985773503780365, "rewards/progression_diversity/mean": -0.00018066662596538663, "rewards/progression_diversity/std": 0.0020558543037623167, "rewards/symbolic_reward_accuracy/mean": 0.103515625, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.6048828363418579, "rewards/symbolic_reward_partial_score/std": 0.21721047163009644, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0502285957336426, "sampling/importance_sampling_ratio/min": 0.0018710603471845388, "sampling/sampling_logp_difference/max": 6.28125, "sampling/sampling_logp_difference/mean": 0.1014840379357338, "step": 921 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2395830601453781, "epoch": 1.4775641025641026, "grad_norm": 0.017518319189548492, "learning_rate": 1e-06, "loss": 0.0038, "step": 922 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.23873654007911682, "epoch": 1.4791666666666667, "grad_norm": 0.015559080988168716, "learning_rate": 1e-06, "loss": 0.01, "step": 923 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2355535551905632, "epoch": 1.4807692307692308, "grad_norm": 0.019503874704241753, "learning_rate": 1e-06, "loss": 0.003, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3569.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 1200.4765625, "completions/mean_terminated_length": 1200.4765625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.2354103922843933, "epoch": 1.482371794871795, "frac_reward_zero_std": 0.21875, "grad_norm": 0.025359977036714554, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 770377745.0, "reward": 0.29045236110687256, "reward_std": 0.059886179864406586, "rewards/progression_diversity/mean": -0.00017451572057325393, "rewards/progression_diversity/std": 0.002117466414347291, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.6556802988052368, "rewards/symbolic_reward_partial_score/std": 0.22488947212696075, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0493230819702148, "sampling/importance_sampling_ratio/min": 0.0001748588983900845, "sampling/sampling_logp_difference/max": 8.651531219482422, "sampling/sampling_logp_difference/mean": 0.10150042176246643, "step": 925 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23772598803043365, "epoch": 1.483974358974359, "grad_norm": 0.019928786903619766, "learning_rate": 1e-06, "loss": 0.0116, "step": 926 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23272281885147095, "epoch": 1.4855769230769231, "grad_norm": 0.020350804552435875, "learning_rate": 1e-06, "loss": 0.0017, "step": 927 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2364245057106018, "epoch": 1.4871794871794872, "grad_norm": 0.023986948654055595, "learning_rate": 1e-06, "loss": -0.0081, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4100.0, "completions/mean_length": 1294.845703125, "completions/mean_terminated_length": 1235.672607421875, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.23016077280044556, "epoch": 1.4887820512820513, "frac_reward_zero_std": 0.09375, "grad_norm": 171.7614288330078, "learning_rate": 1e-06, "loss": 0.0327, "num_tokens": 771956514.0, "reward": 0.22332197427749634, "reward_std": 0.07956110686063766, "rewards/progression_diversity/mean": -0.0012982608750462532, "rewards/progression_diversity/std": 0.026022804901003838, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.6285644769668579, "rewards/symbolic_reward_partial_score/std": 0.20716184377670288, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451966524124146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 1.1087749004364014, "step": 929 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2362649217247963, "epoch": 1.4903846153846154, "grad_norm": 0.02201710268855095, "learning_rate": 1e-06, "loss": -0.0087, "step": 930 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.22993260622024536, "epoch": 1.4919871794871795, "grad_norm": 0.020292161032557487, "learning_rate": 1e-06, "loss": 0.0475, "step": 931 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.230661079287529, "epoch": 1.4935897435897436, "grad_norm": 0.021966397762298584, "learning_rate": 1e-06, "loss": -0.0086, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3575.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 1369.162109375, "completions/mean_terminated_length": 1369.162109375, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "entropy": 0.23234926909208298, "epoch": 1.4951923076923077, "frac_reward_zero_std": 0.03125, "grad_norm": 0.03726167976856232, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 773450389.0, "reward": 0.29813235998153687, "reward_std": 0.12365701794624329, "rewards/progression_diversity/mean": -0.0002430165041005239, "rewards/progression_diversity/std": 0.00278676301240921, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.6461262702941895, "rewards/symbolic_reward_partial_score/std": 0.24039986729621887, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0482527017593384, "sampling/importance_sampling_ratio/min": 5.24304415801171e-15, "sampling/sampling_logp_difference/max": 32.881874084472656, "sampling/sampling_logp_difference/mean": 0.10041865706443787, "step": 933 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.23114058375358582, "epoch": 1.4967948717948718, "grad_norm": 0.02173478528857231, "learning_rate": 1e-06, "loss": -0.0046, "step": 934 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2359921634197235, "epoch": 1.498397435897436, "grad_norm": 0.01734345592558384, "learning_rate": 1e-06, "loss": 0.009, "step": 935 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2320852428674698, "epoch": 1.5, "grad_norm": 0.025080712512135506, "learning_rate": 1e-06, "loss": 0.0043, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3237.0, "completions/max_terminated_length": 3237.0, "completions/mean_length": 1222.326171875, "completions/mean_terminated_length": 1222.326171875, "completions/min_length": 378.0, "completions/min_terminated_length": 378.0, "entropy": 0.23251069337129593, "epoch": 1.501602564102564, "frac_reward_zero_std": 0.0625, "grad_norm": 0.03220761939883232, "learning_rate": 1e-06, "loss": -0.0103, "num_tokens": 774980300.0, "reward": 0.26821303367614746, "reward_std": 0.10178233683109283, "rewards/progression_diversity/mean": -0.00047397619346156716, "rewards/progression_diversity/std": 0.004046800546348095, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.6284343004226685, "rewards/symbolic_reward_partial_score/std": 0.20864459872245789, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0485594272613525, "sampling/importance_sampling_ratio/min": 0.0010224799625575542, "sampling/sampling_logp_difference/max": 6.885524272918701, "sampling/sampling_logp_difference/mean": 0.10064580291509628, "step": 937 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.23433350771665573, "epoch": 1.5032051282051282, "grad_norm": 0.027498142793774605, "learning_rate": 1e-06, "loss": -0.0059, "step": 938 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23485945910215378, "epoch": 1.5048076923076923, "grad_norm": 0.022965963929891586, "learning_rate": 1e-06, "loss": 0.0029, "step": 939 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2323073446750641, "epoch": 1.5064102564102564, "grad_norm": 0.017642127349972725, "learning_rate": 1e-06, "loss": 0.011, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3077.0, "completions/max_terminated_length": 3077.0, "completions/mean_length": 1306.53515625, "completions/mean_terminated_length": 1306.53515625, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "entropy": 0.23624175786972046, "epoch": 1.5080128205128205, "frac_reward_zero_std": 0.125, "grad_norm": 0.032414428889751434, "learning_rate": 1e-06, "loss": -0.0088, "num_tokens": 776444462.0, "reward": 0.35686036944389343, "reward_std": 0.1280536651611328, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.240234375, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.7090657949447632, "rewards/symbolic_reward_partial_score/std": 0.2242516130208969, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0489134788513184, "sampling/importance_sampling_ratio/min": 0.0029864604584872723, "sampling/sampling_logp_difference/max": 5.813666343688965, "sampling/sampling_logp_difference/mean": 0.10092984884977341, "step": 941 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2372562438249588, "epoch": 1.5096153846153846, "grad_norm": 0.0257510244846344, "learning_rate": 1e-06, "loss": 0.014, "step": 942 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.23343972861766815, "epoch": 1.5112179487179487, "grad_norm": 0.018904410302639008, "learning_rate": 1e-06, "loss": 0.0061, "step": 943 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.23894614726305008, "epoch": 1.5128205128205128, "grad_norm": 0.023683864623308182, "learning_rate": 1e-06, "loss": 0.0012, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3415.0, "completions/mean_length": 1307.0390625, "completions/mean_terminated_length": 1277.5341796875, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.23835492879152298, "epoch": 1.5144230769230769, "frac_reward_zero_std": 0.125, "grad_norm": 0.02669195644557476, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 777998866.0, "reward": 0.27208733558654785, "reward_std": 0.10921524465084076, "rewards/progression_diversity/mean": -0.0007387942750938237, "rewards/progression_diversity/std": 0.014222451485693455, "rewards/symbolic_reward_accuracy/mean": 0.13671875, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.6341959834098816, "rewards/symbolic_reward_partial_score/std": 0.22311989963054657, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0479480028152466, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.576566219329834, "step": 945 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23516834527254105, "epoch": 1.516025641025641, "grad_norm": 0.02226245403289795, "learning_rate": 1e-06, "loss": 0.017, "step": 946 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.23766741156578064, "epoch": 1.5176282051282053, "grad_norm": 0.018796978518366814, "learning_rate": 1e-06, "loss": 0.0079, "step": 947 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.24504876136779785, "epoch": 1.5192307692307692, "grad_norm": 0.019867926836013794, "learning_rate": 1e-06, "loss": 0.0013, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3256.0, "completions/max_terminated_length": 3256.0, "completions/mean_length": 1339.611328125, "completions/mean_terminated_length": 1339.611328125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.24304132908582687, "epoch": 1.5208333333333335, "frac_reward_zero_std": 0.09375, "grad_norm": 0.0299557913094759, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 779544475.0, "reward": 0.23521313071250916, "reward_std": 0.07730136811733246, "rewards/progression_diversity/mean": -0.00017136070528067648, "rewards/progression_diversity/std": 0.0020358404144644737, "rewards/symbolic_reward_accuracy/mean": 0.07421875, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.6356120109558105, "rewards/symbolic_reward_partial_score/std": 0.2210559844970703, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0506515502929688, "sampling/importance_sampling_ratio/min": 7.503909728256986e-05, "sampling/sampling_logp_difference/max": 9.497501373291016, "sampling/sampling_logp_difference/mean": 0.10373318195343018, "step": 949 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.24009553343057632, "epoch": 1.5224358974358974, "grad_norm": 0.01935042440891266, "learning_rate": 1e-06, "loss": -0.0066, "step": 950 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2450866997241974, "epoch": 1.5240384615384617, "grad_norm": 0.02386186644434929, "learning_rate": 1e-06, "loss": 0.0068, "step": 951 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.24486373364925385, "epoch": 1.5256410256410255, "grad_norm": 0.026845736429095268, "learning_rate": 1e-06, "loss": -0.0011, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3384.0, "completions/mean_length": 1382.8046875, "completions/mean_terminated_length": 1353.4481201171875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "entropy": 0.24295812845230103, "epoch": 1.5272435897435899, "frac_reward_zero_std": 0.25, "grad_norm": 0.022905530408024788, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 781073095.0, "reward": 0.2498174011707306, "reward_std": 0.08474698662757874, "rewards/progression_diversity/mean": -0.00019435372087173164, "rewards/progression_diversity/std": 0.0025657941587269306, "rewards/symbolic_reward_accuracy/mean": 0.10546875, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.6224446296691895, "rewards/symbolic_reward_partial_score/std": 0.231918066740036, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049942970275879, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.46404212713241577, "step": 953 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.24674228578805923, "epoch": 1.5288461538461537, "grad_norm": 0.01813218928873539, "learning_rate": 1e-06, "loss": -0.0004, "step": 954 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2422228530049324, "epoch": 1.530448717948718, "grad_norm": 1672.424560546875, "learning_rate": 1e-06, "loss": 0.0588, "step": 955 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2425452098250389, "epoch": 1.532051282051282, "grad_norm": 0.017666012048721313, "learning_rate": 1e-06, "loss": -0.0026, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3996.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 1283.716796875, "completions/mean_terminated_length": 1283.716796875, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.2416147142648697, "epoch": 1.5336538461538463, "frac_reward_zero_std": 0.1875, "grad_norm": 0.022795159369707108, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 782631542.0, "reward": 0.29501813650131226, "reward_std": 0.07359684258699417, "rewards/progression_diversity/mean": -0.00013884622603654861, "rewards/progression_diversity/std": 0.0017234663246199489, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.6748046875, "rewards/symbolic_reward_partial_score/std": 0.21226514875888824, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0523028373718262, "sampling/importance_sampling_ratio/min": 1.8659156353351136e-07, "sampling/sampling_logp_difference/max": 15.494343757629395, "sampling/sampling_logp_difference/mean": 0.10692096501588821, "step": 957 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.24522508680820465, "epoch": 1.5352564102564101, "grad_norm": 0.01961701549589634, "learning_rate": 1e-06, "loss": 0.004, "step": 958 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.25065000355243683, "epoch": 1.5368589743589745, "grad_norm": 0.020117729902267456, "learning_rate": 1e-06, "loss": -0.0044, "step": 959 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.25352972745895386, "epoch": 1.5384615384615383, "grad_norm": 0.019687386229634285, "learning_rate": 1e-06, "loss": -0.0013, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3396.0, "completions/mean_length": 1496.544921875, "completions/mean_terminated_length": 1467.410888671875, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.23719095438718796, "epoch": 1.5400641025641026, "frac_reward_zero_std": 0.15625, "grad_norm": 0.03074975125491619, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 784215197.0, "reward": 0.3043617606163025, "reward_std": 0.09115153551101685, "rewards/progression_diversity/mean": -0.0003482637694105506, "rewards/progression_diversity/std": 0.006979175843298435, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.6395508050918579, "rewards/symbolic_reward_partial_score/std": 0.25730493664741516, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0488170385360718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.163615345954895, "step": 961 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2416258454322815, "epoch": 1.5416666666666665, "grad_norm": 0.02943342924118042, "learning_rate": 1e-06, "loss": 0.0171, "step": 962 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.24121008068323135, "epoch": 1.5432692307692308, "grad_norm": 0.022504808381199837, "learning_rate": 1e-06, "loss": -0.0017, "step": 963 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.23982763290405273, "epoch": 1.5448717948717947, "grad_norm": 0.026652559638023376, "learning_rate": 1e-06, "loss": -0.0056, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3516.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 1295.953125, "completions/mean_terminated_length": 1295.953125, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.2383585274219513, "epoch": 1.546474358974359, "frac_reward_zero_std": 0.1875, "grad_norm": 0.03317641466856003, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 785827077.0, "reward": 0.2608430087566376, "reward_std": 0.09105145186185837, "rewards/progression_diversity/mean": -0.00017204870528075844, "rewards/progression_diversity/std": 0.002580570289865136, "rewards/symbolic_reward_accuracy/mean": 0.091796875, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.6858886480331421, "rewards/symbolic_reward_partial_score/std": 0.18577834963798523, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0511445999145508, "sampling/importance_sampling_ratio/min": 2.0922935606293436e-10, "sampling/sampling_logp_difference/max": 22.28759002685547, "sampling/sampling_logp_difference/mean": 0.10452830791473389, "step": 965 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2417585551738739, "epoch": 1.5480769230769231, "grad_norm": 0.015128690749406815, "learning_rate": 1e-06, "loss": 0.0021, "step": 966 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24052689224481583, "epoch": 1.5496794871794872, "grad_norm": 0.016625335440039635, "learning_rate": 1e-06, "loss": 0.0104, "step": 967 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23810868710279465, "epoch": 1.5512820512820513, "grad_norm": 0.024602653458714485, "learning_rate": 1e-06, "loss": -0.0002, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3115.0, "completions/max_terminated_length": 3115.0, "completions/mean_length": 1370.482421875, "completions/mean_terminated_length": 1370.482421875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.24552591890096664, "epoch": 1.5528846153846154, "frac_reward_zero_std": 0.125, "grad_norm": 0.030604401603341103, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 787405868.0, "reward": 0.22608765959739685, "reward_std": 0.0656827911734581, "rewards/progression_diversity/mean": -0.00012242203229106963, "rewards/progression_diversity/std": 0.0018643095390871167, "rewards/symbolic_reward_accuracy/mean": 0.060546875, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.6325358152389526, "rewards/symbolic_reward_partial_score/std": 0.17896120250225067, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.051476240158081, "sampling/importance_sampling_ratio/min": 0.003860935801640153, "sampling/sampling_logp_difference/max": 5.556845664978027, "sampling/sampling_logp_difference/mean": 0.10471828281879425, "step": 969 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.24233480542898178, "epoch": 1.5544871794871795, "grad_norm": 0.017392568290233612, "learning_rate": 1e-06, "loss": 0.0187, "step": 970 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2429162785410881, "epoch": 1.5560897435897436, "grad_norm": 0.02391919493675232, "learning_rate": 1e-06, "loss": -0.0042, "step": 971 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.24405669420957565, "epoch": 1.5576923076923077, "grad_norm": 0.020459329709410667, "learning_rate": 1e-06, "loss": 0.0036, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3250.0, "completions/max_terminated_length": 3250.0, "completions/mean_length": 1407.92578125, "completions/mean_terminated_length": 1407.92578125, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.2465820536017418, "epoch": 1.5592948717948718, "frac_reward_zero_std": 0.125, "grad_norm": 0.02617330104112625, "learning_rate": 1e-06, "loss": 0.0132, "num_tokens": 788955078.0, "reward": 0.2446797788143158, "reward_std": 0.07382390648126602, "rewards/progression_diversity/mean": -0.00028444104827940464, "rewards/progression_diversity/std": 0.002962089842185378, "rewards/symbolic_reward_accuracy/mean": 0.078125, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.6593587398529053, "rewards/symbolic_reward_partial_score/std": 0.18929307162761688, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0518171787261963, "sampling/importance_sampling_ratio/min": 7.697512046433985e-06, "sampling/sampling_logp_difference/max": 11.774613380432129, "sampling/sampling_logp_difference/mean": 0.10526497662067413, "step": 973 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.24588342010974884, "epoch": 1.560897435897436, "grad_norm": 0.022333713248372078, "learning_rate": 1e-06, "loss": 0.0055, "step": 974 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.24611911922693253, "epoch": 1.5625, "grad_norm": 0.01700982078909874, "learning_rate": 1e-06, "loss": 0.0166, "step": 975 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2488018348813057, "epoch": 1.564102564102564, "grad_norm": 0.01537299808114767, "learning_rate": 1e-06, "loss": -0.0055, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3609.0, "completions/max_terminated_length": 3609.0, "completions/mean_length": 1228.7265625, "completions/mean_terminated_length": 1228.7265625, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "entropy": 0.2481737807393074, "epoch": 1.5657051282051282, "frac_reward_zero_std": 0.1875, "grad_norm": 0.03226266801357269, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 790546554.0, "reward": 0.19851306080818176, "reward_std": 0.0660344660282135, "rewards/progression_diversity/mean": -0.0002565660688560456, "rewards/progression_diversity/std": 0.0029936530627310276, "rewards/symbolic_reward_accuracy/mean": 0.044921875, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.5718749761581421, "rewards/symbolic_reward_partial_score/std": 0.20530447363853455, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0529671907424927, "sampling/importance_sampling_ratio/min": 0.0003155683516524732, "sampling/sampling_logp_difference/max": 8.061135292053223, "sampling/sampling_logp_difference/mean": 0.10595399141311646, "step": 977 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.24593167752027512, "epoch": 1.5673076923076923, "grad_norm": 0.017432967200875282, "learning_rate": 1e-06, "loss": -0.0049, "step": 978 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.24341944605112076, "epoch": 1.5689102564102564, "grad_norm": 0.01746542379260063, "learning_rate": 1e-06, "loss": 0.0044, "step": 979 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.24650365114212036, "epoch": 1.5705128205128205, "grad_norm": 0.015472476370632648, "learning_rate": 1e-06, "loss": -0.003, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3315.0, "completions/max_terminated_length": 3315.0, "completions/mean_length": 1509.9140625, "completions/mean_terminated_length": 1509.9140625, "completions/min_length": 491.0, "completions/min_terminated_length": 491.0, "entropy": 0.2508416026830673, "epoch": 1.5721153846153846, "frac_reward_zero_std": 0.09375, "grad_norm": 0.025885384529829025, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 792077166.0, "reward": 0.3290412425994873, "reward_std": 0.11779160797595978, "rewards/progression_diversity/mean": -0.00017644937906879932, "rewards/progression_diversity/std": 0.0018998431041836739, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.6788411140441895, "rewards/symbolic_reward_partial_score/std": 0.22582782804965973, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0522162914276123, "sampling/importance_sampling_ratio/min": 0.00034631905145943165, "sampling/sampling_logp_difference/max": 7.9681501388549805, "sampling/sampling_logp_difference/mean": 0.10585089027881622, "step": 981 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.24679741263389587, "epoch": 1.5737179487179487, "grad_norm": 0.025917481631040573, "learning_rate": 1e-06, "loss": -0.0061, "step": 982 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.24447974562644958, "epoch": 1.5753205128205128, "grad_norm": 0.020733220502734184, "learning_rate": 1e-06, "loss": -0.0044, "step": 983 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2511764466762543, "epoch": 1.5769230769230769, "grad_norm": 0.022386424243450165, "learning_rate": 1e-06, "loss": 0.0115, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3325.0, "completions/max_terminated_length": 3325.0, "completions/mean_length": 1401.8359375, "completions/mean_terminated_length": 1401.8359375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.25129084289073944, "epoch": 1.578525641025641, "frac_reward_zero_std": 0.125, "grad_norm": 0.03222787380218506, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 793645338.0, "reward": 0.23190167546272278, "reward_std": 0.05650953948497772, "rewards/progression_diversity/mean": -0.0002622118918225169, "rewards/progression_diversity/std": 0.0023251688107848167, "rewards/symbolic_reward_accuracy/mean": 0.0703125, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.6323893666267395, "rewards/symbolic_reward_partial_score/std": 0.19433899223804474, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0537452697753906, "sampling/importance_sampling_ratio/min": 2.0533148514800814e-08, "sampling/sampling_logp_difference/max": 17.70122528076172, "sampling/sampling_logp_difference/mean": 0.10701755434274673, "step": 985 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2517654523253441, "epoch": 1.5801282051282053, "grad_norm": 0.023150594905018806, "learning_rate": 1e-06, "loss": -0.0046, "step": 986 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2524363100528717, "epoch": 1.5817307692307692, "grad_norm": 0.017600167542696, "learning_rate": 1e-06, "loss": 0.0146, "step": 987 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2514464929699898, "epoch": 1.5833333333333335, "grad_norm": 0.014767357148230076, "learning_rate": 1e-06, "loss": 0.0002, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3305.0, "completions/mean_length": 1364.2421875, "completions/mean_terminated_length": 1334.849365234375, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.24437803030014038, "epoch": 1.5849358974358974, "frac_reward_zero_std": 0.3125, "grad_norm": 0.026894228532910347, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 795228758.0, "reward": 0.25398755073547363, "reward_std": 0.0541648268699646, "rewards/progression_diversity/mean": -0.00017180124996230006, "rewards/progression_diversity/std": 0.0016380609013140202, "rewards/symbolic_reward_accuracy/mean": 0.095703125, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.6558756828308105, "rewards/symbolic_reward_partial_score/std": 0.18879389762878418, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0525712966918945, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.19978845119476318, "step": 989 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24415334314107895, "epoch": 1.5865384615384617, "grad_norm": 0.02484998293220997, "learning_rate": 1e-06, "loss": -0.0023, "step": 990 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25222089886665344, "epoch": 1.5881410256410255, "grad_norm": 0.01625809818506241, "learning_rate": 1e-06, "loss": 0.004, "step": 991 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2524196058511734, "epoch": 1.5897435897435899, "grad_norm": 0.019578304141759872, "learning_rate": 1e-06, "loss": 0.003, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3574.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 1432.29296875, "completions/mean_terminated_length": 1432.29296875, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.25225862860679626, "epoch": 1.5913461538461537, "frac_reward_zero_std": 0.15625, "grad_norm": 0.02162688970565796, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 796763980.0, "reward": 0.2408090978860855, "reward_std": 0.06670250743627548, "rewards/progression_diversity/mean": -0.00014496369112748653, "rewards/progression_diversity/std": 0.0017526200972497463, "rewards/symbolic_reward_accuracy/mean": 0.080078125, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.6425455808639526, "rewards/symbolic_reward_partial_score/std": 0.1895546019077301, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0530476570129395, "sampling/importance_sampling_ratio/min": 0.003646494820713997, "sampling/sampling_logp_difference/max": 5.613988876342773, "sampling/sampling_logp_difference/mean": 0.10609988868236542, "step": 993 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2456853985786438, "epoch": 1.592948717948718, "grad_norm": 0.019648630172014236, "learning_rate": 1e-06, "loss": -0.0031, "step": 994 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2553592026233673, "epoch": 1.594551282051282, "grad_norm": 0.025570129975676537, "learning_rate": 1e-06, "loss": 0.0049, "step": 995 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24990757554769516, "epoch": 1.5961538461538463, "grad_norm": 0.04156876355409622, "learning_rate": 1e-06, "loss": 0.0087, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3606.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 1444.166015625, "completions/mean_terminated_length": 1444.166015625, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "entropy": 0.2480572983622551, "epoch": 1.5977564102564101, "frac_reward_zero_std": 0.0, "grad_norm": 0.029209570959210396, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 798317441.0, "reward": 0.2392357736825943, "reward_std": 0.06654238700866699, "rewards/progression_diversity/mean": -0.0002508866600692272, "rewards/progression_diversity/std": 0.0026269752997905016, "rewards/symbolic_reward_accuracy/mean": 0.091796875, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.6138671636581421, "rewards/symbolic_reward_partial_score/std": 0.2098008245229721, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0531139373779297, "sampling/importance_sampling_ratio/min": 0.003923820797353983, "sampling/sampling_logp_difference/max": 5.540689468383789, "sampling/sampling_logp_difference/mean": 0.10546484589576721, "step": 997 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.24528241902589798, "epoch": 1.5993589743589745, "grad_norm": 0.02387697622179985, "learning_rate": 1e-06, "loss": 0.0125, "step": 998 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.24644892662763596, "epoch": 1.6009615384615383, "grad_norm": 0.023926518857479095, "learning_rate": 1e-06, "loss": 0.0259, "step": 999 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.24555939435958862, "epoch": 1.6025641025641026, "grad_norm": 0.02301941066980362, "learning_rate": 1e-06, "loss": -0.0124, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3619.0, "completions/mean_length": 1375.71875, "completions/mean_terminated_length": 1346.3482666015625, "completions/min_length": 456.0, "completions/min_terminated_length": 456.0, "entropy": 0.23569654673337936, "epoch": 1.6041666666666665, "frac_reward_zero_std": 0.125, "grad_norm": 0.03722744435071945, "learning_rate": 1e-06, "loss": -0.0089, "num_tokens": 800005953.0, "reward": 0.23674920201301575, "reward_std": 0.0885818749666214, "rewards/progression_diversity/mean": -0.0008616495179012418, "rewards/progression_diversity/std": 0.013127473182976246, "rewards/symbolic_reward_accuracy/mean": 0.095703125, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.598437488079071, "rewards/symbolic_reward_partial_score/std": 0.22310422360897064, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0491344928741455, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 1.0696396827697754, "step": 1001 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2391332983970642, "epoch": 1.6057692307692308, "grad_norm": 0.023793857544660568, "learning_rate": 1e-06, "loss": 0.0327, "step": 1002 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2433808520436287, "epoch": 1.6073717948717947, "grad_norm": 0.01809019409120083, "learning_rate": 1e-06, "loss": 0.0041, "step": 1003 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23954389244318008, "epoch": 1.608974358974359, "grad_norm": 0.017016278579831123, "learning_rate": 1e-06, "loss": -0.001, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3544.0, "completions/max_terminated_length": 3544.0, "completions/mean_length": 1332.548828125, "completions/mean_terminated_length": 1332.548828125, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.24541258811950684, "epoch": 1.6105769230769231, "frac_reward_zero_std": 0.0625, "grad_norm": 0.024081578478217125, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 801514282.0, "reward": 0.2921689450740814, "reward_std": 0.07876162230968475, "rewards/progression_diversity/mean": -0.0003911844396498054, "rewards/progression_diversity/std": 0.0043034846894443035, "rewards/symbolic_reward_accuracy/mean": 0.140625, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.6926594972610474, "rewards/symbolic_reward_partial_score/std": 0.20360209047794342, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052016019821167, "sampling/importance_sampling_ratio/min": 8.38755295262672e-06, "sampling/sampling_logp_difference/max": 11.688761711120605, "sampling/sampling_logp_difference/mean": 0.10484174638986588, "step": 1005 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.23762197047472, "epoch": 1.6121794871794872, "grad_norm": 0.017401108518242836, "learning_rate": 1e-06, "loss": 0.0039, "step": 1006 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.24435139447450638, "epoch": 1.6137820512820513, "grad_norm": 0.02926545962691307, "learning_rate": 1e-06, "loss": 0.0016, "step": 1007 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24484486877918243, "epoch": 1.6153846153846154, "grad_norm": 0.023529132828116417, "learning_rate": 1e-06, "loss": -0.0068, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3515.0, "completions/mean_length": 1400.970703125, "completions/mean_terminated_length": 1371.649658203125, "completions/min_length": 404.0, "completions/min_terminated_length": 404.0, "entropy": 0.23839521408081055, "epoch": 1.6169871794871795, "frac_reward_zero_std": 0.21875, "grad_norm": 0.036472517997026443, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 803169707.0, "reward": 0.23677846789360046, "reward_std": 0.052502475678920746, "rewards/progression_diversity/mean": -0.0003774126525968313, "rewards/progression_diversity/std": 0.0034933658316731453, "rewards/symbolic_reward_accuracy/mean": 0.0703125, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.6493000984191895, "rewards/symbolic_reward_partial_score/std": 0.23349793255329132, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0512499809265137, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 228.0, "sampling/sampling_logp_difference/mean": 0.11092744767665863, "step": 1009 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24148935079574585, "epoch": 1.6185897435897436, "grad_norm": 0.029029127210378647, "learning_rate": 1e-06, "loss": 0.0061, "step": 1010 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.24070608615875244, "epoch": 1.6201923076923077, "grad_norm": 0.020563537254929543, "learning_rate": 1e-06, "loss": 0.009, "step": 1011 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.24124725908041, "epoch": 1.6217948717948718, "grad_norm": 0.016905367374420166, "learning_rate": 1e-06, "loss": 0.0065, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3733.0, "completions/mean_length": 1487.85546875, "completions/mean_terminated_length": 1400.0589599609375, "completions/min_length": 485.0, "completions/min_terminated_length": 485.0, "entropy": 0.2362624704837799, "epoch": 1.623397435897436, "frac_reward_zero_std": 0.0625, "grad_norm": 0.025372762233018875, "learning_rate": 1e-06, "loss": 0.0274, "num_tokens": 804794641.0, "reward": 0.2821791172027588, "reward_std": 0.1007528007030487, "rewards/progression_diversity/mean": -0.0003515832358971238, "rewards/progression_diversity/std": 0.004287549760192633, "rewards/symbolic_reward_accuracy/mean": 0.150390625, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.6417806148529053, "rewards/symbolic_reward_partial_score/std": 0.22223784029483795, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0479587316513062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.5552178621292114, "step": 1013 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.237822063267231, "epoch": 1.625, "grad_norm": 0.017621813341975212, "learning_rate": 1e-06, "loss": -0.0144, "step": 1014 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.24151387810707092, "epoch": 1.626602564102564, "grad_norm": 0.01545657031238079, "learning_rate": 1e-06, "loss": -0.0025, "step": 1015 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23863951116800308, "epoch": 1.6282051282051282, "grad_norm": 153.45928955078125, "learning_rate": 1e-06, "loss": 0.0558, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4848.0, "completions/mean_length": 1533.744140625, "completions/mean_terminated_length": 1504.6829833984375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.23886650800704956, "epoch": 1.6298076923076923, "frac_reward_zero_std": 0.125, "grad_norm": 0.03264259546995163, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 806450974.0, "reward": 0.1910867989063263, "reward_std": 0.055806443095207214, "rewards/progression_diversity/mean": -0.0006959763122722507, "rewards/progression_diversity/std": 0.00666604982689023, "rewards/symbolic_reward_accuracy/mean": 0.025390625, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.5868489742279053, "rewards/symbolic_reward_partial_score/std": 0.21801504492759705, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0492140054702759, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.3248835504055023, "step": 1017 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23825782537460327, "epoch": 1.6314102564102564, "grad_norm": 0.02487725391983986, "learning_rate": 1e-06, "loss": 0.0279, "step": 1018 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23552914708852768, "epoch": 1.6330128205128205, "grad_norm": 0.029635349288582802, "learning_rate": 1e-06, "loss": 0.0094, "step": 1019 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2386602759361267, "epoch": 1.6346153846153846, "grad_norm": 0.02634367346763611, "learning_rate": 1e-06, "loss": 0.0005, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3918.0, "completions/mean_length": 1762.474609375, "completions/mean_terminated_length": 1676.2967529296875, "completions/min_length": 469.0, "completions/min_terminated_length": 469.0, "entropy": 0.23500239849090576, "epoch": 1.6362179487179487, "frac_reward_zero_std": 0.125, "grad_norm": 0.02729848027229309, "learning_rate": 1e-06, "loss": -0.0103, "num_tokens": 808198689.0, "reward": 0.29712605476379395, "reward_std": 0.09707895666360855, "rewards/progression_diversity/mean": -0.0002862706023734063, "rewards/progression_diversity/std": 0.004292279481887817, "rewards/symbolic_reward_accuracy/mean": 0.1484375, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.695507824420929, "rewards/symbolic_reward_partial_score/std": 0.23922735452651978, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.047842025756836, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 0.4406740665435791, "step": 1021 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.23327183723449707, "epoch": 1.6378205128205128, "grad_norm": 20.64282989501953, "learning_rate": 1e-06, "loss": 0.0436, "step": 1022 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22972074151039124, "epoch": 1.6394230769230769, "grad_norm": 0.02955857291817665, "learning_rate": 1e-06, "loss": 0.0107, "step": 1023 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.23090046644210815, "epoch": 1.641025641025641, "grad_norm": 0.025243405252695084, "learning_rate": 1e-06, "loss": -0.0164, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4233.0, "completions/mean_length": 1425.66796875, "completions/mean_terminated_length": 1396.395263671875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "entropy": 0.24533987045288086, "epoch": 1.6426282051282053, "frac_reward_zero_std": 0.15625, "grad_norm": 0.031315580010414124, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 809735287.0, "reward": 0.2972439229488373, "reward_std": 0.09149158000946045, "rewards/progression_diversity/mean": -0.00021706035477109253, "rewards/progression_diversity/std": 0.0027459680568426847, "rewards/symbolic_reward_accuracy/mean": 0.1640625, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.6633463501930237, "rewards/symbolic_reward_partial_score/std": 0.2075018286705017, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0503005981445312, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 704.0, "sampling/sampling_logp_difference/mean": 0.17904753983020782, "step": 1025 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.23708289116621017, "epoch": 1.6442307692307692, "grad_norm": 0.04043427109718323, "learning_rate": 1e-06, "loss": 0.0122, "step": 1026 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2370290458202362, "epoch": 1.6458333333333335, "grad_norm": 0.019201675429940224, "learning_rate": 1e-06, "loss": 0.0346, "step": 1027 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23796197026968002, "epoch": 1.6474358974358974, "grad_norm": 0.021262312307953835, "learning_rate": 1e-06, "loss": 0.0163, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4151.0, "completions/max_terminated_length": 4151.0, "completions/mean_length": 1320.484375, "completions/mean_terminated_length": 1320.484375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.22943055629730225, "epoch": 1.6490384615384617, "frac_reward_zero_std": 0.21875, "grad_norm": 0.03114836849272251, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 811322847.0, "reward": 0.31148767471313477, "reward_std": 0.11784271150827408, "rewards/progression_diversity/mean": -0.00015914140385575593, "rewards/progression_diversity/std": 0.0020391715224832296, "rewards/symbolic_reward_accuracy/mean": 0.181640625, "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, "rewards/symbolic_reward_partial_score/mean": 0.6750162839889526, "rewards/symbolic_reward_partial_score/std": 0.22747963666915894, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0495221614837646, "sampling/importance_sampling_ratio/min": 0.0016271222848445177, "sampling/sampling_logp_difference/max": 6.420942306518555, "sampling/sampling_logp_difference/mean": 0.10276812314987183, "step": 1029 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.23415642231702805, "epoch": 1.6506410256410255, "grad_norm": 0.017106305807828903, "learning_rate": 1e-06, "loss": -0.007, "step": 1030 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2420409992337227, "epoch": 1.6522435897435899, "grad_norm": 0.021166684105992317, "learning_rate": 1e-06, "loss": 0.0165, "step": 1031 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.22991270571947098, "epoch": 1.6538461538461537, "grad_norm": 0.026932181790471077, "learning_rate": 1e-06, "loss": 0.0001, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3803.0, "completions/mean_length": 1430.66015625, "completions/mean_terminated_length": 1372.0196533203125, "completions/min_length": 468.0, "completions/min_terminated_length": 468.0, "entropy": 0.23568789660930634, "epoch": 1.655448717948718, "frac_reward_zero_std": 0.25, "grad_norm": 0.02257698029279709, "learning_rate": 1e-06, "loss": 0.0309, "num_tokens": 812878337.0, "reward": 0.372047483921051, "reward_std": 0.05590501427650452, "rewards/progression_diversity/mean": -0.0008204940240830183, "rewards/progression_diversity/std": 0.008694627322256565, "rewards/symbolic_reward_accuracy/mean": 0.255859375, "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, "rewards/symbolic_reward_partial_score/mean": 0.7297688722610474, "rewards/symbolic_reward_partial_score/std": 0.2233889102935791, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0493979454040527, "sampling/importance_sampling_ratio/min": 1.542988610145346e-12, "sampling/sampling_logp_difference/max": 27.19729995727539, "sampling/sampling_logp_difference/mean": 0.10240708291530609, "step": 1033 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2355349287390709, "epoch": 1.657051282051282, "grad_norm": 0.017844535410404205, "learning_rate": 1e-06, "loss": 0.0074, "step": 1034 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.23197440803050995, "epoch": 1.6586538461538463, "grad_norm": 0.020765669643878937, "learning_rate": 1e-06, "loss": 0.0046, "step": 1035 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.23954713344573975, "epoch": 1.6602564102564101, "grad_norm": 0.02009597420692444, "learning_rate": 1e-06, "loss": -0.0043, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4692.0, "completions/max_terminated_length": 4692.0, "completions/mean_length": 1252.927734375, "completions/mean_terminated_length": 1252.927734375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.23540305346250534, "epoch": 1.6618589743589745, "frac_reward_zero_std": 0.375, "grad_norm": 0.022153835743665695, "learning_rate": 1e-06, "loss": 0.0036, "num_tokens": 814426508.0, "reward": 0.29029542207717896, "reward_std": 0.04895063489675522, "rewards/progression_diversity/mean": -0.00024394365027546883, "rewards/progression_diversity/std": 0.0027705603279173374, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.7176594734191895, "rewards/symbolic_reward_partial_score/std": 0.19320081174373627, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0504940748214722, "sampling/importance_sampling_ratio/min": 0.0024849912151694298, "sampling/sampling_logp_difference/max": 5.997486114501953, "sampling/sampling_logp_difference/mean": 0.10428472608327866, "step": 1037 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.24068902432918549, "epoch": 1.6634615384615383, "grad_norm": 0.021785540506243706, "learning_rate": 1e-06, "loss": 0.0068, "step": 1038 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23235713690519333, "epoch": 1.6650641025641026, "grad_norm": 0.020368505269289017, "learning_rate": 1e-06, "loss": 0.0047, "step": 1039 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.236878864467144, "epoch": 1.6666666666666665, "grad_norm": 0.0215722918510437, "learning_rate": 1e-06, "loss": 0.0003, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4650.0, "completions/mean_length": 1380.171875, "completions/mean_terminated_length": 1291.74072265625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.23111912608146667, "epoch": 1.6682692307692308, "frac_reward_zero_std": 0.1875, "grad_norm": 2.0536277294158936, "learning_rate": 1e-06, "loss": 0.0259, "num_tokens": 816040052.0, "reward": 0.26442086696624756, "reward_std": 0.0863037258386612, "rewards/progression_diversity/mean": -0.0022489791736006737, "rewards/progression_diversity/std": 0.027088049799203873, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.6373372077941895, "rewards/symbolic_reward_partial_score/std": 0.2190377116203308, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.047485589981079, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.2174200415611267, "step": 1041 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.23076869547367096, "epoch": 1.6698717948717947, "grad_norm": 0.016981706023216248, "learning_rate": 1e-06, "loss": -0.0025, "step": 1042 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.22752298414707184, "epoch": 1.671474358974359, "grad_norm": 0.019416002556681633, "learning_rate": 1e-06, "loss": 0.0329, "step": 1043 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.23331021517515182, "epoch": 1.6730769230769231, "grad_norm": 0.015192318707704544, "learning_rate": 1e-06, "loss": 0.0003, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3838.0, "completions/max_terminated_length": 3838.0, "completions/mean_length": 1214.56640625, "completions/mean_terminated_length": 1214.56640625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.24394644051790237, "epoch": 1.6746794871794872, "frac_reward_zero_std": 0.28125, "grad_norm": 0.03186223655939102, "learning_rate": 1e-06, "loss": 0.0053, "num_tokens": 817475990.0, "reward": 0.31599071621894836, "reward_std": 0.08919353783130646, "rewards/progression_diversity/mean": -4.969753717887215e-05, "rewards/progression_diversity/std": 0.0011245269561186433, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.6704915165901184, "rewards/symbolic_reward_partial_score/std": 0.22778773307800293, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0511565208435059, "sampling/importance_sampling_ratio/min": 0.00036043746513314545, "sampling/sampling_logp_difference/max": 7.928192138671875, "sampling/sampling_logp_difference/mean": 0.1060151681303978, "step": 1045 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.23804038763046265, "epoch": 1.6762820512820513, "grad_norm": 0.012359860353171825, "learning_rate": 1e-06, "loss": 0.0012, "step": 1046 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.24049938470125198, "epoch": 1.6778846153846154, "grad_norm": 0.01780606620013714, "learning_rate": 1e-06, "loss": 0.0041, "step": 1047 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2433236464858055, "epoch": 1.6794871794871795, "grad_norm": 0.018220573663711548, "learning_rate": 1e-06, "loss": -0.0022, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3815.0, "completions/max_terminated_length": 3815.0, "completions/mean_length": 1282.876953125, "completions/mean_terminated_length": 1282.876953125, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "entropy": 0.24433332681655884, "epoch": 1.6810897435897436, "frac_reward_zero_std": 0.1875, "grad_norm": 0.032106902450323105, "learning_rate": 1e-06, "loss": -0.0046, "num_tokens": 818907495.0, "reward": 0.29857733845710754, "reward_std": 0.13391146063804626, "rewards/progression_diversity/mean": -0.0001772599498508498, "rewards/progression_diversity/std": 0.0024098155554383993, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.6280761957168579, "rewards/symbolic_reward_partial_score/std": 0.25627195835113525, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0512542724609375, "sampling/importance_sampling_ratio/min": 9.631957800593227e-05, "sampling/sampling_logp_difference/max": 9.247838973999023, "sampling/sampling_logp_difference/mean": 0.10506170988082886, "step": 1049 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.24064549058675766, "epoch": 1.6826923076923077, "grad_norm": 0.01762561686336994, "learning_rate": 1e-06, "loss": 0.0004, "step": 1050 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24004538357257843, "epoch": 1.6842948717948718, "grad_norm": 0.022497214376926422, "learning_rate": 1e-06, "loss": 0.0018, "step": 1051 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2305016741156578, "epoch": 1.685897435897436, "grad_norm": 0.02127472683787346, "learning_rate": 1e-06, "loss": -0.0066, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3644.0, "completions/max_terminated_length": 3644.0, "completions/mean_length": 1173.380859375, "completions/mean_terminated_length": 1173.380859375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "entropy": 0.23441405594348907, "epoch": 1.6875, "frac_reward_zero_std": 0.21875, "grad_norm": 0.02767036482691765, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 820447722.0, "reward": 0.2577317953109741, "reward_std": 0.04780818521976471, "rewards/progression_diversity/mean": -0.0002602554450277239, "rewards/progression_diversity/std": 0.0028319996781647205, "rewards/symbolic_reward_accuracy/mean": 0.11328125, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.6325520873069763, "rewards/symbolic_reward_partial_score/std": 0.20778940618038177, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0493583679199219, "sampling/importance_sampling_ratio/min": 0.0004806618671864271, "sampling/sampling_logp_difference/max": 7.640346527099609, "sampling/sampling_logp_difference/mean": 0.10264922678470612, "step": 1053 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.22839296609163284, "epoch": 1.689102564102564, "grad_norm": 0.01796301268041134, "learning_rate": 1e-06, "loss": -0.0, "step": 1054 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.22674311697483063, "epoch": 1.6907051282051282, "grad_norm": 0.01843862235546112, "learning_rate": 1e-06, "loss": 0.0064, "step": 1055 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.22671975195407867, "epoch": 1.6923076923076923, "grad_norm": 0.02962394803762436, "learning_rate": 1e-06, "loss": 0.0027, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3951.0, "completions/mean_length": 1224.1875, "completions/mean_terminated_length": 1194.5205078125, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "entropy": 0.22822780907154083, "epoch": 1.6939102564102564, "frac_reward_zero_std": 0.1875, "grad_norm": 0.06622759997844696, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 821993514.0, "reward": 0.26173463463783264, "reward_std": 0.07937778532505035, "rewards/progression_diversity/mean": -0.00036702307988889515, "rewards/progression_diversity/std": 0.0035130290780216455, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.6387370228767395, "rewards/symbolic_reward_partial_score/std": 0.2266356348991394, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0486118793487549, "sampling/importance_sampling_ratio/min": 1.0303001507505627e-11, "sampling/sampling_logp_difference/max": 25.298585891723633, "sampling/sampling_logp_difference/mean": 0.10055477917194366, "step": 1057 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23284170031547546, "epoch": 1.6955128205128205, "grad_norm": 0.02792915143072605, "learning_rate": 1e-06, "loss": -0.0146, "step": 1058 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.22962763160467148, "epoch": 1.6971153846153846, "grad_norm": 0.018218589946627617, "learning_rate": 1e-06, "loss": -0.0005, "step": 1059 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2237945944070816, "epoch": 1.6987179487179487, "grad_norm": 0.01743151806294918, "learning_rate": 1e-06, "loss": 0.0005, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3416.0, "completions/mean_length": 1289.6484375, "completions/mean_terminated_length": 1260.109619140625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "entropy": 0.22563714534044266, "epoch": 1.7003205128205128, "frac_reward_zero_std": 0.21875, "grad_norm": 0.027078071609139442, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 823552902.0, "reward": 0.27887141704559326, "reward_std": 0.07552332431077957, "rewards/progression_diversity/mean": -6.769368337700143e-05, "rewards/progression_diversity/std": 0.0008789710118435323, "rewards/symbolic_reward_accuracy/mean": 0.146484375, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.637255847454071, "rewards/symbolic_reward_partial_score/std": 0.2156410962343216, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0477631092071533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.3541214168071747, "step": 1061 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.22320057451725006, "epoch": 1.7019230769230769, "grad_norm": 614.7477416992188, "learning_rate": 1e-06, "loss": 0.0147, "step": 1062 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2247406244277954, "epoch": 1.703525641025641, "grad_norm": 0.016136281192302704, "learning_rate": 1e-06, "loss": 0.0032, "step": 1063 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.22823335230350494, "epoch": 1.7051282051282053, "grad_norm": 0.02105732634663582, "learning_rate": 1e-06, "loss": -0.004, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4059.0, "completions/mean_length": 1367.603515625, "completions/mean_terminated_length": 1338.2171630859375, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "entropy": 0.2308506816625595, "epoch": 1.7067307692307692, "frac_reward_zero_std": 0.1875, "grad_norm": 0.027784336358308792, "learning_rate": 1e-06, "loss": -0.003, "num_tokens": 825156523.0, "reward": 0.2746616005897522, "reward_std": 0.09659157693386078, "rewards/progression_diversity/mean": -0.0001522963575553149, "rewards/progression_diversity/std": 0.0019277379615232348, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.6466634273529053, "rewards/symbolic_reward_partial_score/std": 0.22976163029670715, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0482929944992065, "sampling/importance_sampling_ratio/min": 4.196563168079592e-06, "sampling/sampling_logp_difference/max": 12.381244659423828, "sampling/sampling_logp_difference/mean": 0.10019037127494812, "step": 1065 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.22658158838748932, "epoch": 1.7083333333333335, "grad_norm": 0.019185129553079605, "learning_rate": 1e-06, "loss": -0.004, "step": 1066 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2332712933421135, "epoch": 1.7099358974358974, "grad_norm": 0.0282441433519125, "learning_rate": 1e-06, "loss": 0.0069, "step": 1067 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.23132744431495667, "epoch": 1.7115384615384617, "grad_norm": 0.160556823015213, "learning_rate": 1e-06, "loss": 0.0127, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4672.0, "completions/max_terminated_length": 4672.0, "completions/mean_length": 1349.27734375, "completions/mean_terminated_length": 1349.27734375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.22879090905189514, "epoch": 1.7131410256410255, "frac_reward_zero_std": 0.1875, "grad_norm": 0.10462523251771927, "learning_rate": 1e-06, "loss": 0.0044, "num_tokens": 826677881.0, "reward": 0.2972085177898407, "reward_std": 0.11050209403038025, "rewards/progression_diversity/mean": -0.0008295955485664308, "rewards/progression_diversity/std": 0.008723512291908264, "rewards/symbolic_reward_accuracy/mean": 0.158203125, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.67431640625, "rewards/symbolic_reward_partial_score/std": 0.23070181906223297, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0478163957595825, "sampling/importance_sampling_ratio/min": 0.0023437354248017073, "sampling/sampling_logp_difference/max": 6.056009292602539, "sampling/sampling_logp_difference/mean": 0.09988612681627274, "step": 1069 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.22955607622861862, "epoch": 1.7147435897435899, "grad_norm": 0.022891266271471977, "learning_rate": 1e-06, "loss": 0.0089, "step": 1070 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.23325985670089722, "epoch": 1.7163461538461537, "grad_norm": 0.021752603352069855, "learning_rate": 1e-06, "loss": -0.004, "step": 1071 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.3046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2312638759613037, "epoch": 1.717948717948718, "grad_norm": 0.016995754092931747, "learning_rate": 1e-06, "loss": 0.0081, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3701.0, "completions/mean_length": 1393.966796875, "completions/mean_terminated_length": 1364.632080078125, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "entropy": 0.23279593884944916, "epoch": 1.719551282051282, "frac_reward_zero_std": 0.21875, "grad_norm": 0.02495507150888443, "learning_rate": 1e-06, "loss": 0.0083, "num_tokens": 828169640.0, "reward": 0.2605937719345093, "reward_std": 0.053633954375982285, "rewards/progression_diversity/mean": -0.0006842931034043431, "rewards/progression_diversity/std": 0.006281269248574972, "rewards/symbolic_reward_accuracy/mean": 0.111328125, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.6466634273529053, "rewards/symbolic_reward_partial_score/std": 0.21451237797737122, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0482239723205566, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 149.9818115234375, "sampling/sampling_logp_difference/mean": 0.10541543364524841, "step": 1073 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2331898808479309, "epoch": 1.7211538461538463, "grad_norm": 0.02485121600329876, "learning_rate": 1e-06, "loss": 0.007, "step": 1074 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2273246869444847, "epoch": 1.7227564102564101, "grad_norm": 0.025766994804143906, "learning_rate": 1e-06, "loss": -0.0, "step": 1075 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2262198030948639, "epoch": 1.7243589743589745, "grad_norm": 70.85974884033203, "learning_rate": 1e-06, "loss": 0.013, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12415.0, "completions/mean_length": 1387.005859375, "completions/mean_terminated_length": 1298.614990234375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "entropy": 0.22221150994300842, "epoch": 1.7259615384615383, "frac_reward_zero_std": 0.34375, "grad_norm": 0.019594384357333183, "learning_rate": 1e-06, "loss": 0.0331, "num_tokens": 829798859.0, "reward": 0.3287256360054016, "reward_std": 0.0677950382232666, "rewards/progression_diversity/mean": -0.0004827457305509597, "rewards/progression_diversity/std": 0.005075244233012199, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.6797525882720947, "rewards/symbolic_reward_partial_score/std": 0.23522421717643738, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0458087921142578, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.522271990776062, "step": 1077 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2273356318473816, "epoch": 1.7275641025641026, "grad_norm": 0.018257969990372658, "learning_rate": 1e-06, "loss": -0.0013, "step": 1078 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.22496186941862106, "epoch": 1.7291666666666665, "grad_norm": 0.01670248992741108, "learning_rate": 1e-06, "loss": -0.002, "step": 1079 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2231815606355667, "epoch": 1.7307692307692308, "grad_norm": 0.021122116595506668, "learning_rate": 1e-06, "loss": 0.061, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4567.0, "completions/max_terminated_length": 4567.0, "completions/mean_length": 1337.560546875, "completions/mean_terminated_length": 1337.560546875, "completions/min_length": 488.0, "completions/min_terminated_length": 488.0, "entropy": 0.2179974913597107, "epoch": 1.7323717948717947, "frac_reward_zero_std": 0.25, "grad_norm": 0.02882332168519497, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 831412682.0, "reward": 0.25469720363616943, "reward_std": 0.042546242475509644, "rewards/progression_diversity/mean": -0.0004942620871588588, "rewards/progression_diversity/std": 0.004545349627733231, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.6068196892738342, "rewards/symbolic_reward_partial_score/std": 0.2223115861415863, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0473459959030151, "sampling/importance_sampling_ratio/min": 2.902779897340224e-06, "sampling/sampling_logp_difference/max": 12.749841690063477, "sampling/sampling_logp_difference/mean": 0.09889820963144302, "step": 1081 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22881120443344116, "epoch": 1.733974358974359, "grad_norm": 0.024050477892160416, "learning_rate": 1e-06, "loss": 0.0022, "step": 1082 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.22975780814886093, "epoch": 1.7355769230769231, "grad_norm": 0.014919934794306755, "learning_rate": 1e-06, "loss": 0.0094, "step": 1083 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.3125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.22842169553041458, "epoch": 1.7371794871794872, "grad_norm": 0.015330510213971138, "learning_rate": 1e-06, "loss": -0.0049, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3636.0, "completions/max_terminated_length": 3636.0, "completions/mean_length": 1231.79296875, "completions/mean_terminated_length": 1231.79296875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.23932421952486038, "epoch": 1.7387820512820513, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02318231202661991, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 832881792.0, "reward": 0.25613856315612793, "reward_std": 0.03916171193122864, "rewards/progression_diversity/mean": -0.0004032338911201805, "rewards/progression_diversity/std": 0.0032340919133275747, "rewards/symbolic_reward_accuracy/mean": 0.09765625, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.658496081829071, "rewards/symbolic_reward_partial_score/std": 0.20491082966327667, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.050029993057251, "sampling/importance_sampling_ratio/min": 0.0007102126837708056, "sampling/sampling_logp_difference/max": 7.249946117401123, "sampling/sampling_logp_difference/mean": 0.10211381316184998, "step": 1085 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.23092254996299744, "epoch": 1.7403846153846154, "grad_norm": 0.013799590058624744, "learning_rate": 1e-06, "loss": -0.0024, "step": 1086 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23684141784906387, "epoch": 1.7419871794871795, "grad_norm": 0.018218791112303734, "learning_rate": 1e-06, "loss": 0.0023, "step": 1087 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2406981661915779, "epoch": 1.7435897435897436, "grad_norm": 0.012886385433375835, "learning_rate": 1e-06, "loss": 0.0037, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4425.0, "completions/mean_length": 1411.501953125, "completions/mean_terminated_length": 1233.9625244140625, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.22861982136964798, "epoch": 1.7451923076923077, "frac_reward_zero_std": 0.28125, "grad_norm": 152.24722290039062, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 834562321.0, "reward": 0.2795882225036621, "reward_std": 0.03494984656572342, "rewards/progression_diversity/mean": -0.0011390319559723139, "rewards/progression_diversity/std": 0.019081035628914833, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.6273112297058105, "rewards/symbolic_reward_partial_score/std": 0.21833449602127075, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0453113317489624, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 0.9983885884284973, "step": 1089 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2288360819220543, "epoch": 1.7467948717948718, "grad_norm": 0.01483655534684658, "learning_rate": 1e-06, "loss": 0.0598, "step": 1090 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2326875552535057, "epoch": 1.748397435897436, "grad_norm": 136.22293090820312, "learning_rate": 1e-06, "loss": 0.0291, "step": 1091 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.233309805393219, "epoch": 1.75, "grad_norm": 0.023925775662064552, "learning_rate": 1e-06, "loss": 0.0074, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3879.0, "completions/mean_length": 1282.169921875, "completions/mean_terminated_length": 1252.616455078125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "entropy": 0.23763851076364517, "epoch": 1.751602564102564, "frac_reward_zero_std": 0.1875, "grad_norm": 0.028567634522914886, "learning_rate": 1e-06, "loss": 0.0098, "num_tokens": 836112856.0, "reward": 0.2890186309814453, "reward_std": 0.06066850945353508, "rewards/progression_diversity/mean": -0.00048197241267189384, "rewards/progression_diversity/std": 0.0038850673008710146, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.6984374523162842, "rewards/symbolic_reward_partial_score/std": 0.21642756462097168, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0489928722381592, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.8276915550231934, "step": 1093 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23289386928081512, "epoch": 1.7532051282051282, "grad_norm": 0.015758074820041656, "learning_rate": 1e-06, "loss": -0.0056, "step": 1094 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.23799997568130493, "epoch": 1.7548076923076923, "grad_norm": 0.026696138083934784, "learning_rate": 1e-06, "loss": 0.0046, "step": 1095 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2384975180029869, "epoch": 1.7564102564102564, "grad_norm": 0.02093910239636898, "learning_rate": 1e-06, "loss": 0.014, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3248.0, "completions/max_terminated_length": 3248.0, "completions/mean_length": 1161.291015625, "completions/mean_terminated_length": 1161.291015625, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "entropy": 0.23988765478134155, "epoch": 1.7580128205128205, "frac_reward_zero_std": 0.3125, "grad_norm": 0.025087056681513786, "learning_rate": 1e-06, "loss": 0.0066, "num_tokens": 837526381.0, "reward": 0.32568255066871643, "reward_std": 0.07381178438663483, "rewards/progression_diversity/mean": -0.00010529650171520188, "rewards/progression_diversity/std": 0.0017732558771967888, "rewards/symbolic_reward_accuracy/mean": 0.21484375, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.6559244990348816, "rewards/symbolic_reward_partial_score/std": 0.2370387762784958, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.051661729812622, "sampling/importance_sampling_ratio/min": 8.779976212736074e-08, "sampling/sampling_logp_difference/max": 16.248207092285156, "sampling/sampling_logp_difference/mean": 0.10443771630525589, "step": 1097 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.24236983805894852, "epoch": 1.7596153846153846, "grad_norm": 0.01639835350215435, "learning_rate": 1e-06, "loss": -0.0026, "step": 1098 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23802829533815384, "epoch": 1.7612179487179487, "grad_norm": 0.01632961444556713, "learning_rate": 1e-06, "loss": -0.0023, "step": 1099 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.23437372595071793, "epoch": 1.7628205128205128, "grad_norm": 0.013542949222028255, "learning_rate": 1e-06, "loss": 0.0084, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3300.0, "completions/mean_length": 1334.92578125, "completions/mean_terminated_length": 1275.909912109375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.22916942834854126, "epoch": 1.7644230769230769, "frac_reward_zero_std": 0.40625, "grad_norm": 0.020054111257195473, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 839122695.0, "reward": 0.2580447793006897, "reward_std": 0.033093225210905075, "rewards/progression_diversity/mean": -0.00021209442638792098, "rewards/progression_diversity/std": 0.0026572090573608875, "rewards/symbolic_reward_accuracy/mean": 0.107421875, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.6466145515441895, "rewards/symbolic_reward_partial_score/std": 0.1929846704006195, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0464985370635986, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.6539708971977234, "step": 1101 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.22496525943279266, "epoch": 1.766025641025641, "grad_norm": 698.2822265625, "learning_rate": 1e-06, "loss": 0.0585, "step": 1102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22553813457489014, "epoch": 1.7676282051282053, "grad_norm": 0.027719808742403984, "learning_rate": 1e-06, "loss": 0.0128, "step": 1103 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2250175029039383, "epoch": 1.7692307692307692, "grad_norm": 0.015375426970422268, "learning_rate": 1e-06, "loss": 0.0297, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3192.0, "completions/mean_length": 1293.3046875, "completions/mean_terminated_length": 1174.4803466796875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.22259868681430817, "epoch": 1.7708333333333335, "frac_reward_zero_std": 0.34375, "grad_norm": 692.8626098632812, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 840688755.0, "reward": 0.3711351156234741, "reward_std": 0.09630028158426285, "rewards/progression_diversity/mean": -0.001723826164379716, "rewards/progression_diversity/std": 0.026012076064944267, "rewards/symbolic_reward_accuracy/mean": 0.271484375, "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, "rewards/symbolic_reward_partial_score/mean": 0.6968098878860474, "rewards/symbolic_reward_partial_score/std": 0.23859074711799622, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450929403305054, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.8819183111190796, "step": 1105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22188594937324524, "epoch": 1.7724358974358974, "grad_norm": 0.017745744436979294, "learning_rate": 1e-06, "loss": -0.0115, "step": 1106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22368201613426208, "epoch": 1.7740384615384617, "grad_norm": 0.01989768259227276, "learning_rate": 1e-06, "loss": 0.037, "step": 1107 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22869247198104858, "epoch": 1.7756410256410255, "grad_norm": 0.013347904197871685, "learning_rate": 1e-06, "loss": 0.0261, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3432.0, "completions/mean_length": 1230.427734375, "completions/mean_terminated_length": 1171.0020751953125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.22080733627080917, "epoch": 1.7772435897435899, "frac_reward_zero_std": 0.34375, "grad_norm": 0.01532725803554058, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 842301870.0, "reward": 0.2819012403488159, "reward_std": 0.061507582664489746, "rewards/progression_diversity/mean": -0.0003044310724362731, "rewards/progression_diversity/std": 0.005076899658888578, "rewards/symbolic_reward_accuracy/mean": 0.142578125, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.6558268070220947, "rewards/symbolic_reward_partial_score/std": 0.21851463615894318, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0459144115447998, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.5158374905586243, "step": 1109 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2195032313466072, "epoch": 1.7788461538461537, "grad_norm": 0.027773037552833557, "learning_rate": 1e-06, "loss": -0.0005, "step": 1110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2160264179110527, "epoch": 1.780448717948718, "grad_norm": 0.016807833686470985, "learning_rate": 1e-06, "loss": 0.0238, "step": 1111 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21882987767457962, "epoch": 1.782051282051282, "grad_norm": 0.017015958204865456, "learning_rate": 1e-06, "loss": -0.0025, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3504.0, "completions/mean_length": 1344.291015625, "completions/mean_terminated_length": 1285.3118896484375, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.22031641751527786, "epoch": 1.7836538461538463, "frac_reward_zero_std": 0.375, "grad_norm": 0.02029530704021454, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 843814051.0, "reward": 0.3037402629852295, "reward_std": 0.06527281552553177, "rewards/progression_diversity/mean": -1.7595344559140358e-07, "rewards/progression_diversity/std": 3.981372174166609e-06, "rewards/symbolic_reward_accuracy/mean": 0.1640625, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.6856445074081421, "rewards/symbolic_reward_partial_score/std": 0.2108502984046936, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0456528663635254, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.39048463106155396, "step": 1113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21349480748176575, "epoch": 1.7852564102564101, "grad_norm": 0.019900523126125336, "learning_rate": 1e-06, "loss": 0.0008, "step": 1114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.21848982572555542, "epoch": 1.7868589743589745, "grad_norm": 0.010702534578740597, "learning_rate": 1e-06, "loss": 0.0025, "step": 1115 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.22711623460054398, "epoch": 1.7884615384615383, "grad_norm": 0.021489204838871956, "learning_rate": 1e-06, "loss": 0.0038, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3124.0, "completions/max_terminated_length": 3124.0, "completions/mean_length": 1099.703125, "completions/mean_terminated_length": 1099.703125, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "entropy": 0.2246062159538269, "epoch": 1.7900641025641026, "frac_reward_zero_std": 0.5, "grad_norm": 0.01770607940852642, "learning_rate": 1e-06, "loss": 0.0047, "num_tokens": 845220171.0, "reward": 0.3277079463005066, "reward_std": 0.05349338799715042, "rewards/progression_diversity/mean": -0.00020373196457512677, "rewards/progression_diversity/std": 0.002731936750933528, "rewards/symbolic_reward_accuracy/mean": 0.1953125, "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, "rewards/symbolic_reward_partial_score/mean": 0.7017415761947632, "rewards/symbolic_reward_partial_score/std": 0.196936696767807, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0474233627319336, "sampling/importance_sampling_ratio/min": 1.3281011845833746e-08, "sampling/sampling_logp_difference/max": 18.136930465698242, "sampling/sampling_logp_difference/mean": 0.09797333925962448, "step": 1117 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21827614307403564, "epoch": 1.7916666666666665, "grad_norm": 0.011651732958853245, "learning_rate": 1e-06, "loss": 0.0005, "step": 1118 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.21932896971702576, "epoch": 1.7932692307692308, "grad_norm": 0.010273098014295101, "learning_rate": 1e-06, "loss": -0.0024, "step": 1119 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2230270653963089, "epoch": 1.7948717948717947, "grad_norm": 0.02401752397418022, "learning_rate": 1e-06, "loss": 0.0044, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3138.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 1054.958984375, "completions/mean_terminated_length": 1054.958984375, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.21830987185239792, "epoch": 1.796474358974359, "frac_reward_zero_std": 0.3125, "grad_norm": 0.015644196420907974, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 846662438.0, "reward": 0.3441348671913147, "reward_std": 0.06147376075387001, "rewards/progression_diversity/mean": -8.792350126896054e-05, "rewards/progression_diversity/std": 0.00172401312738657, "rewards/symbolic_reward_accuracy/mean": 0.216796875, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.7135253548622131, "rewards/symbolic_reward_partial_score/std": 0.2144842892885208, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0474189519882202, "sampling/importance_sampling_ratio/min": 4.838908353121951e-05, "sampling/sampling_logp_difference/max": 9.936236381530762, "sampling/sampling_logp_difference/mean": 0.0984996110200882, "step": 1121 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22250553965568542, "epoch": 1.7980769230769231, "grad_norm": 0.020056243985891342, "learning_rate": 1e-06, "loss": 0.007, "step": 1122 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22270743548870087, "epoch": 1.7996794871794872, "grad_norm": 0.018208522349596024, "learning_rate": 1e-06, "loss": 0.0082, "step": 1123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.22166861593723297, "epoch": 1.8012820512820513, "grad_norm": 0.014248647727072239, "learning_rate": 1e-06, "loss": -0.0053, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5645.0, "completions/max_terminated_length": 5645.0, "completions/mean_length": 1187.763671875, "completions/mean_terminated_length": 1187.763671875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.21534562855958939, "epoch": 1.8028846153846154, "frac_reward_zero_std": 0.375, "grad_norm": 0.024967024102807045, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 848091485.0, "reward": 0.4134520888328552, "reward_std": 0.04205480217933655, "rewards/progression_diversity/mean": -0.0004985664272680879, "rewards/progression_diversity/std": 0.007882904261350632, "rewards/symbolic_reward_accuracy/mean": 0.30078125, "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, "rewards/symbolic_reward_partial_score/mean": 0.7766276001930237, "rewards/symbolic_reward_partial_score/std": 0.2088257223367691, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0464260578155518, "sampling/importance_sampling_ratio/min": 1.6780650184955448e-05, "sampling/sampling_logp_difference/max": 10.995284080505371, "sampling/sampling_logp_difference/mean": 0.09675593674182892, "step": 1125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22125839442014694, "epoch": 1.8044871794871795, "grad_norm": 0.012069555930793285, "learning_rate": 1e-06, "loss": -0.0023, "step": 1126 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22001603245735168, "epoch": 1.8060897435897436, "grad_norm": 0.014061706140637398, "learning_rate": 1e-06, "loss": -0.0032, "step": 1127 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22054609656333923, "epoch": 1.8076923076923077, "grad_norm": 0.011219343170523643, "learning_rate": 1e-06, "loss": -0.0031, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8772.0, "completions/max_terminated_length": 8772.0, "completions/mean_length": 1137.666015625, "completions/mean_terminated_length": 1137.666015625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "entropy": 0.21648870408535004, "epoch": 1.8092948717948718, "frac_reward_zero_std": 0.46875, "grad_norm": 0.022340567782521248, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 849531970.0, "reward": 0.3116188943386078, "reward_std": 0.024817654862999916, "rewards/progression_diversity/mean": -0.00022150327276904136, "rewards/progression_diversity/std": 0.004627527203410864, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.6793619394302368, "rewards/symbolic_reward_partial_score/std": 0.1976500004529953, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0465974807739258, "sampling/importance_sampling_ratio/min": 5.26275156570658e-19, "sampling/sampling_logp_difference/max": 42.088462829589844, "sampling/sampling_logp_difference/mean": 0.09620961546897888, "step": 1129 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.21649357676506042, "epoch": 1.810897435897436, "grad_norm": 0.01756151393055916, "learning_rate": 1e-06, "loss": 0.0014, "step": 1130 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21943299472332, "epoch": 1.8125, "grad_norm": 0.015073378570377827, "learning_rate": 1e-06, "loss": -0.007, "step": 1131 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2232174053788185, "epoch": 1.814102564102564, "grad_norm": 0.01907176896929741, "learning_rate": 1e-06, "loss": 0.0046, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3176.0, "completions/max_terminated_length": 3176.0, "completions/mean_length": 1107.029296875, "completions/mean_terminated_length": 1107.029296875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.2129407450556755, "epoch": 1.8157051282051282, "frac_reward_zero_std": 0.34375, "grad_norm": 0.025003377348184586, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 850939665.0, "reward": 0.28454291820526123, "reward_std": 0.03294537216424942, "rewards/progression_diversity/mean": -0.00029976246878504753, "rewards/progression_diversity/std": 0.002749260514974594, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.682861328125, "rewards/symbolic_reward_partial_score/std": 0.1956394761800766, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451654195785522, "sampling/importance_sampling_ratio/min": 8.991118960466338e-08, "sampling/sampling_logp_difference/max": 16.224443435668945, "sampling/sampling_logp_difference/mean": 0.09530235081911087, "step": 1133 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.21520614624023438, "epoch": 1.8173076923076923, "grad_norm": 0.008025935851037502, "learning_rate": 1e-06, "loss": -0.004, "step": 1134 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.21551941335201263, "epoch": 1.8189102564102564, "grad_norm": 0.021345248445868492, "learning_rate": 1e-06, "loss": 0.0012, "step": 1135 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.21580560505390167, "epoch": 1.8205128205128205, "grad_norm": 0.0155746815726161, "learning_rate": 1e-06, "loss": 0.0006, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3362.0, "completions/max_terminated_length": 3362.0, "completions/mean_length": 1122.947265625, "completions/mean_terminated_length": 1122.947265625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "entropy": 0.21716423332691193, "epoch": 1.8221153846153846, "frac_reward_zero_std": 0.25, "grad_norm": 0.02500241994857788, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 852408278.0, "reward": 0.28805452585220337, "reward_std": 0.041980091482400894, "rewards/progression_diversity/mean": -0.00020892325846944004, "rewards/progression_diversity/std": 0.001957833534106612, "rewards/symbolic_reward_accuracy/mean": 0.13671875, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.6867513656616211, "rewards/symbolic_reward_partial_score/std": 0.19679629802703857, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045684576034546, "sampling/importance_sampling_ratio/min": 4.51097449324922e-12, "sampling/sampling_logp_difference/max": 26.124507904052734, "sampling/sampling_logp_difference/mean": 0.09778503328561783, "step": 1137 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.21945731341838837, "epoch": 1.8237179487179487, "grad_norm": 0.013288813643157482, "learning_rate": 1e-06, "loss": -0.0022, "step": 1138 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.22035419940948486, "epoch": 1.8253205128205128, "grad_norm": 0.025754936039447784, "learning_rate": 1e-06, "loss": 0.0074, "step": 1139 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22086691856384277, "epoch": 1.8269230769230769, "grad_norm": 0.018110549077391624, "learning_rate": 1e-06, "loss": 0.0034, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3428.0, "completions/mean_length": 1185.529296875, "completions/mean_terminated_length": 1155.7867431640625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.22858010977506638, "epoch": 1.828525641025641, "frac_reward_zero_std": 0.3125, "grad_norm": 0.029191119596362114, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 853835237.0, "reward": 0.38153761625289917, "reward_std": 0.07013949751853943, "rewards/progression_diversity/mean": -4.912294389214367e-05, "rewards/progression_diversity/std": 0.0007894500158727169, "rewards/symbolic_reward_accuracy/mean": 0.275390625, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.7216634154319763, "rewards/symbolic_reward_partial_score/std": 0.22831861674785614, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046138048171997, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 472.0, "sampling/sampling_logp_difference/mean": 0.10319283604621887, "step": 1141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2211167812347412, "epoch": 1.8301282051282053, "grad_norm": 0.02721370942890644, "learning_rate": 1e-06, "loss": 0.0007, "step": 1142 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2238641008734703, "epoch": 1.8317307692307692, "grad_norm": 0.02170705609023571, "learning_rate": 1e-06, "loss": 0.0196, "step": 1143 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.23243413865566254, "epoch": 1.8333333333333335, "grad_norm": 0.014753867872059345, "learning_rate": 1e-06, "loss": -0.0014, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3178.0, "completions/mean_length": 1113.748046875, "completions/mean_terminated_length": 1083.864990234375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.22820258885622025, "epoch": 1.8349358974358974, "frac_reward_zero_std": 0.46875, "grad_norm": 0.017704647034406662, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 855269764.0, "reward": 0.2694726586341858, "reward_std": 0.026778917759656906, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.126953125, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.6449869871139526, "rewards/symbolic_reward_partial_score/std": 0.22033336758613586, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0469450950622559, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 720.0, "sampling/sampling_logp_difference/mean": 0.3227856755256653, "step": 1145 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.22650671005249023, "epoch": 1.8365384615384617, "grad_norm": 199.25045776367188, "learning_rate": 1e-06, "loss": 0.0266, "step": 1146 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22604414075613022, "epoch": 1.8381410256410255, "grad_norm": 0.009444105438888073, "learning_rate": 1e-06, "loss": -0.0039, "step": 1147 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.22440266609191895, "epoch": 1.8397435897435899, "grad_norm": 0.01778390072286129, "learning_rate": 1e-06, "loss": -0.0054, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3424.0, "completions/mean_length": 1209.474609375, "completions/mean_terminated_length": 1120.037353515625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "entropy": 0.22518840432167053, "epoch": 1.8413461538461537, "frac_reward_zero_std": 0.4375, "grad_norm": 323.1822814941406, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 856870839.0, "reward": 0.28326651453971863, "reward_std": 0.05484289303421974, "rewards/progression_diversity/mean": -0.0004978624056093395, "rewards/progression_diversity/std": 0.009513349272310734, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.6402018070220947, "rewards/symbolic_reward_partial_score/std": 0.2216087281703949, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0432510375976562, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.484938621520996, "step": 1149 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.21288639307022095, "epoch": 1.842948717948718, "grad_norm": 1597.748046875, "learning_rate": 1e-06, "loss": 0.155, "step": 1150 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22490354627370834, "epoch": 1.844551282051282, "grad_norm": 0.015957126393914223, "learning_rate": 1e-06, "loss": -0.004, "step": 1151 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22101520746946335, "epoch": 1.8461538461538463, "grad_norm": 0.017633656039834023, "learning_rate": 1e-06, "loss": 0.0068, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3059.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 1138.349609375, "completions/mean_terminated_length": 1138.349609375, "completions/min_length": 356.0, "completions/min_terminated_length": 356.0, "entropy": 0.23107396066188812, "epoch": 1.8477564102564101, "frac_reward_zero_std": 0.375, "grad_norm": 0.022070132195949554, "learning_rate": 1e-06, "loss": 0.0089, "num_tokens": 858210874.0, "reward": 0.38855424523353577, "reward_std": 0.0762196034193039, "rewards/progression_diversity/mean": -4.658957914216444e-05, "rewards/progression_diversity/std": 0.0010542018571868539, "rewards/symbolic_reward_accuracy/mean": 0.267578125, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.7600260972976685, "rewards/symbolic_reward_partial_score/std": 0.22460278868675232, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0477694272994995, "sampling/importance_sampling_ratio/min": 7.687971810810268e-05, "sampling/sampling_logp_difference/max": 9.473268508911133, "sampling/sampling_logp_difference/mean": 0.10008453577756882, "step": 1153 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22674327343702316, "epoch": 1.8493589743589745, "grad_norm": 0.015364538878202438, "learning_rate": 1e-06, "loss": 0.0082, "step": 1154 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.22742107510566711, "epoch": 1.8509615384615383, "grad_norm": 0.012995360419154167, "learning_rate": 1e-06, "loss": 0.0062, "step": 1155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.22633031755685806, "epoch": 1.8525641025641026, "grad_norm": 0.015840977430343628, "learning_rate": 1e-06, "loss": -0.0093, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 1179.333984375, "completions/mean_terminated_length": 1119.7078857421875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.22518686205148697, "epoch": 1.8541666666666665, "frac_reward_zero_std": 0.40625, "grad_norm": 0.025853393599390984, "learning_rate": 1e-06, "loss": 0.0031, "num_tokens": 859689013.0, "reward": 0.35142046213150024, "reward_std": 0.06578496098518372, "rewards/progression_diversity/mean": -0.0005352114676497877, "rewards/progression_diversity/std": 0.010669847950339317, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.7508463263511658, "rewards/symbolic_reward_partial_score/std": 0.1848256140947342, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0448198318481445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.6326540112495422, "step": 1157 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.21813210099935532, "epoch": 1.8557692307692308, "grad_norm": 0.018281714990735054, "learning_rate": 1e-06, "loss": 0.0262, "step": 1158 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.22226067632436752, "epoch": 1.8573717948717947, "grad_norm": 0.018198495730757713, "learning_rate": 1e-06, "loss": 0.0277, "step": 1159 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.22661340981721878, "epoch": 1.858974358974359, "grad_norm": 0.010464332066476345, "learning_rate": 1e-06, "loss": 0.0035, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3748.0, "completions/mean_length": 1216.416015625, "completions/mean_terminated_length": 1186.73388671875, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "entropy": 0.22897808998823166, "epoch": 1.8605769230769231, "frac_reward_zero_std": 0.53125, "grad_norm": 0.018317364156246185, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 861117626.0, "reward": 0.27255719900131226, "reward_std": 0.025438256561756134, "rewards/progression_diversity/mean": -0.00014033068146090955, "rewards/progression_diversity/std": 0.002014033030718565, "rewards/symbolic_reward_accuracy/mean": 0.12890625, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.6513671875, "rewards/symbolic_reward_partial_score/std": 0.2093709260225296, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0455472469329834, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 0.44371387362480164, "step": 1161 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.22330108284950256, "epoch": 1.8621794871794872, "grad_norm": 0.010753365233540535, "learning_rate": 1e-06, "loss": 0.0224, "step": 1162 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.22619932144880295, "epoch": 1.8637820512820513, "grad_norm": 0.012639653868973255, "learning_rate": 1e-06, "loss": -0.0069, "step": 1163 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2241613268852234, "epoch": 1.8653846153846154, "grad_norm": 0.02455468475818634, "learning_rate": 1e-06, "loss": 0.0086, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3400.0, "completions/max_terminated_length": 3400.0, "completions/mean_length": 1240.51171875, "completions/mean_terminated_length": 1240.51171875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "entropy": 0.23062631487846375, "epoch": 1.8669871794871795, "frac_reward_zero_std": 0.34375, "grad_norm": 0.010606328025460243, "learning_rate": 1e-06, "loss": -0.0041, "num_tokens": 862543008.0, "reward": 0.3666699528694153, "reward_std": 0.06977473199367523, "rewards/progression_diversity/mean": -1.885928213596344e-07, "rewards/progression_diversity/std": 4.2673682401073165e-06, "rewards/symbolic_reward_accuracy/mean": 0.251953125, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.7183268070220947, "rewards/symbolic_reward_partial_score/std": 0.23471878468990326, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.047785997390747, "sampling/importance_sampling_ratio/min": 6.825915477293165e-08, "sampling/sampling_logp_difference/max": 16.499954223632812, "sampling/sampling_logp_difference/mean": 0.09932725131511688, "step": 1165 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.224430114030838, "epoch": 1.8685897435897436, "grad_norm": 0.021581292152404785, "learning_rate": 1e-06, "loss": -0.0059, "step": 1166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22753822803497314, "epoch": 1.8701923076923077, "grad_norm": 0.014603802002966404, "learning_rate": 1e-06, "loss": 0.0018, "step": 1167 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.22805190086364746, "epoch": 1.8717948717948718, "grad_norm": 0.010613877326250076, "learning_rate": 1e-06, "loss": 0.0065, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3325.0, "completions/mean_length": 1174.26953125, "completions/mean_terminated_length": 1144.5048828125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.2323755919933319, "epoch": 1.873397435897436, "frac_reward_zero_std": 0.40625, "grad_norm": 0.02169196680188179, "learning_rate": 1e-06, "loss": 0.0052, "num_tokens": 863941530.0, "reward": 0.3810667097568512, "reward_std": 0.07798746973276138, "rewards/progression_diversity/mean": -0.0002637706929817796, "rewards/progression_diversity/std": 0.003273366717621684, "rewards/symbolic_reward_accuracy/mean": 0.279296875, "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, "rewards/symbolic_reward_partial_score/mean": 0.7122883796691895, "rewards/symbolic_reward_partial_score/std": 0.2458869218826294, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0482022762298584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 712.0, "sampling/sampling_logp_difference/mean": 0.4974533021450043, "step": 1169 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23275116831064224, "epoch": 1.875, "grad_norm": 0.011001808568835258, "learning_rate": 1e-06, "loss": 0.0016, "step": 1170 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.23044289648532867, "epoch": 1.876602564102564, "grad_norm": 0.016635850071907043, "learning_rate": 1e-06, "loss": -0.0069, "step": 1171 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.23118175566196442, "epoch": 1.8782051282051282, "grad_norm": 96598.3203125, "learning_rate": 1e-06, "loss": 4.3159, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 1170.333984375, "completions/mean_terminated_length": 1140.5616455078125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.22949066758155823, "epoch": 1.8798076923076923, "frac_reward_zero_std": 0.46875, "grad_norm": 0.020497862249612808, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 865447381.0, "reward": 0.44851863384246826, "reward_std": 0.04487219825387001, "rewards/progression_diversity/mean": -0.00019038662139791995, "rewards/progression_diversity/std": 0.0023014748003333807, "rewards/symbolic_reward_accuracy/mean": 0.357421875, "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, "rewards/symbolic_reward_partial_score/mean": 0.7808756828308105, "rewards/symbolic_reward_partial_score/std": 0.20934894680976868, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0473530292510986, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.7223914265632629, "step": 1173 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.23133239150047302, "epoch": 1.8814102564102564, "grad_norm": 0.010313909500837326, "learning_rate": 1e-06, "loss": -0.0036, "step": 1174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.23350611329078674, "epoch": 1.8830128205128205, "grad_norm": 0.019430264830589294, "learning_rate": 1e-06, "loss": 0.0015, "step": 1175 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2238868772983551, "epoch": 1.8846153846153846, "grad_norm": 0.013273504562675953, "learning_rate": 1e-06, "loss": 0.0231, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3514.0, "completions/mean_length": 1243.806640625, "completions/mean_terminated_length": 1154.57177734375, "completions/min_length": 389.0, "completions/min_terminated_length": 389.0, "entropy": 0.2321956604719162, "epoch": 1.8862179487179487, "frac_reward_zero_std": 0.34375, "grad_norm": 0.014576968736946583, "learning_rate": 1e-06, "loss": 0.006, "num_tokens": 866961778.0, "reward": 0.20335838198661804, "reward_std": 0.03265673667192459, "rewards/progression_diversity/mean": -0.0005896041984669864, "rewards/progression_diversity/std": 0.008867304772138596, "rewards/symbolic_reward_accuracy/mean": 0.03125, "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, "rewards/symbolic_reward_partial_score/mean": 0.6173340082168579, "rewards/symbolic_reward_partial_score/std": 0.1687249392271042, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0458412170410156, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.8149086236953735, "step": 1177 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22551760077476501, "epoch": 1.8878205128205128, "grad_norm": 0.017989974468946457, "learning_rate": 1e-06, "loss": 0.0529, "step": 1178 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2227119281888008, "epoch": 1.8894230769230769, "grad_norm": 0.02180834859609604, "learning_rate": 1e-06, "loss": 0.005, "step": 1179 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22573918104171753, "epoch": 1.891025641025641, "grad_norm": 126.00947570800781, "learning_rate": 1e-06, "loss": 0.0381, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 1103.10546875, "completions/mean_terminated_length": 1073.2015380859375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.22882883995771408, "epoch": 1.8926282051282053, "frac_reward_zero_std": 0.46875, "grad_norm": 0.019178207963705063, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 868483272.0, "reward": 0.3014499545097351, "reward_std": 0.037228312343358994, "rewards/progression_diversity/mean": -0.001001341617666185, "rewards/progression_diversity/std": 0.01981303095817566, "rewards/symbolic_reward_accuracy/mean": 0.171875, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.6617676019668579, "rewards/symbolic_reward_partial_score/std": 0.21480776369571686, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0480608940124512, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.56421959400177, "step": 1181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.22795479744672775, "epoch": 1.8942307692307692, "grad_norm": 0.011316453106701374, "learning_rate": 1e-06, "loss": -0.0061, "step": 1182 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22722984105348587, "epoch": 1.8958333333333335, "grad_norm": 0.016038935631513596, "learning_rate": 1e-06, "loss": 0.0001, "step": 1183 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.22345466166734695, "epoch": 1.8974358974358974, "grad_norm": 0.0241796113550663, "learning_rate": 1e-06, "loss": 0.0253, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3469.0, "completions/max_terminated_length": 3469.0, "completions/mean_length": 1094.568359375, "completions/mean_terminated_length": 1094.568359375, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.23068898171186447, "epoch": 1.8990384615384617, "frac_reward_zero_std": 0.4375, "grad_norm": 0.025562407448887825, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 869886539.0, "reward": 0.2939690053462982, "reward_std": 0.05816289782524109, "rewards/progression_diversity/mean": -7.342880417127162e-05, "rewards/progression_diversity/std": 0.0009783387649804354, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.7142741084098816, "rewards/symbolic_reward_partial_score/std": 0.16729366779327393, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0498249530792236, "sampling/importance_sampling_ratio/min": 0.002071201568469405, "sampling/sampling_logp_difference/max": 6.17962646484375, "sampling/sampling_logp_difference/mean": 0.10211198776960373, "step": 1185 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2281019389629364, "epoch": 1.9006410256410255, "grad_norm": 0.014062655158340931, "learning_rate": 1e-06, "loss": -0.0028, "step": 1186 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.22974391281604767, "epoch": 1.9022435897435899, "grad_norm": 0.014239110052585602, "learning_rate": 1e-06, "loss": 0.0121, "step": 1187 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.22985130548477173, "epoch": 1.9038461538461537, "grad_norm": 0.015234572812914848, "learning_rate": 1e-06, "loss": -0.0055, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3333.0, "completions/max_terminated_length": 3333.0, "completions/mean_length": 1139.83984375, "completions/mean_terminated_length": 1139.83984375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.22518738359212875, "epoch": 1.905448717948718, "frac_reward_zero_std": 0.3125, "grad_norm": 0.030008932575583458, "learning_rate": 1e-06, "loss": 0.0136, "num_tokens": 871431865.0, "reward": 0.3191036581993103, "reward_std": 0.04677165299654007, "rewards/progression_diversity/mean": -0.0002812910242937505, "rewards/progression_diversity/std": 0.0030721058137714863, "rewards/symbolic_reward_accuracy/mean": 0.193359375, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.6769694089889526, "rewards/symbolic_reward_partial_score/std": 0.21019434928894043, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.048149585723877, "sampling/importance_sampling_ratio/min": 7.317296608670226e-17, "sampling/sampling_logp_difference/max": 37.15370559692383, "sampling/sampling_logp_difference/mean": 0.09954197704792023, "step": 1189 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.22382111847400665, "epoch": 1.907051282051282, "grad_norm": 0.015485767275094986, "learning_rate": 1e-06, "loss": 0.0095, "step": 1190 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22165128588676453, "epoch": 1.9086538461538463, "grad_norm": 0.019756661728024483, "learning_rate": 1e-06, "loss": -0.0091, "step": 1191 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22612737119197845, "epoch": 1.9102564102564101, "grad_norm": 0.020691078156232834, "learning_rate": 1e-06, "loss": -0.0043, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 1140.0625, "completions/mean_terminated_length": 1140.0625, "completions/min_length": 410.0, "completions/min_terminated_length": 410.0, "entropy": 0.22674325108528137, "epoch": 1.9118589743589745, "frac_reward_zero_std": 0.40625, "grad_norm": 0.016689619049429893, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 872902937.0, "reward": 0.26588261127471924, "reward_std": 0.033439576625823975, "rewards/progression_diversity/mean": -0.0001180602703243494, "rewards/progression_diversity/std": 0.0016232561320066452, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.644091784954071, "rewards/symbolic_reward_partial_score/std": 0.22011272609233856, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.048423171043396, "sampling/importance_sampling_ratio/min": 0.0005005777347832918, "sampling/sampling_logp_difference/max": 7.599747657775879, "sampling/sampling_logp_difference/mean": 0.10049919784069061, "step": 1193 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22232681512832642, "epoch": 1.9134615384615383, "grad_norm": 0.019354144111275673, "learning_rate": 1e-06, "loss": -0.0067, "step": 1194 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22689828276634216, "epoch": 1.9150641025641026, "grad_norm": 0.015032010152935982, "learning_rate": 1e-06, "loss": 0.0079, "step": 1195 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2218078002333641, "epoch": 1.9166666666666665, "grad_norm": 0.015426430851221085, "learning_rate": 1e-06, "loss": -0.0044, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 1175.25, "completions/mean_terminated_length": 1115.60791015625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.22400058060884476, "epoch": 1.9182692307692308, "frac_reward_zero_std": 0.53125, "grad_norm": 994.896484375, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 874303465.0, "reward": 0.30390554666519165, "reward_std": 0.03565283119678497, "rewards/progression_diversity/mean": -7.151110912673175e-05, "rewards/progression_diversity/std": 0.0012204337399452925, "rewards/symbolic_reward_accuracy/mean": 0.169921875, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.6744791269302368, "rewards/symbolic_reward_partial_score/std": 0.20566026866436005, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0463134050369263, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 724.0, "sampling/sampling_logp_difference/mean": 0.4934929311275482, "step": 1197 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2278030663728714, "epoch": 1.9198717948717947, "grad_norm": 0.0072776079177856445, "learning_rate": 1e-06, "loss": -0.0058, "step": 1198 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.22660139203071594, "epoch": 1.921474358974359, "grad_norm": 0.011884551495313644, "learning_rate": 1e-06, "loss": 0.0251, "step": 1199 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.22588203847408295, "epoch": 1.9230769230769231, "grad_norm": 0.008959163911640644, "learning_rate": 1e-06, "loss": 0.0028, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3548.0, "completions/max_terminated_length": 3548.0, "completions/mean_length": 1056.767578125, "completions/mean_terminated_length": 1056.767578125, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "entropy": 0.22032135725021362, "epoch": 1.9246794871794872, "frac_reward_zero_std": 0.40625, "grad_norm": 0.01947578601539135, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 875797554.0, "reward": 0.24959278106689453, "reward_std": 0.034925058484077454, "rewards/progression_diversity/mean": -0.00019414318376220763, "rewards/progression_diversity/std": 0.0018961295718327165, "rewards/symbolic_reward_accuracy/mean": 0.09765625, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.6366698741912842, "rewards/symbolic_reward_partial_score/std": 0.21680741012096405, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0476453304290771, "sampling/importance_sampling_ratio/min": 1.6844656158809812e-07, "sampling/sampling_logp_difference/max": 15.596647262573242, "sampling/sampling_logp_difference/mean": 0.0985308587551117, "step": 1201 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.21852584183216095, "epoch": 1.9262820512820513, "grad_norm": 0.02164105512201786, "learning_rate": 1e-06, "loss": -0.0057, "step": 1202 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.21969875693321228, "epoch": 1.9278846153846154, "grad_norm": 0.013584469445049763, "learning_rate": 1e-06, "loss": 0.0022, "step": 1203 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2164856493473053, "epoch": 1.9294871794871795, "grad_norm": 0.01743365451693535, "learning_rate": 1e-06, "loss": 0.0023, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3204.0, "completions/max_terminated_length": 3204.0, "completions/mean_length": 1141.201171875, "completions/mean_terminated_length": 1141.201171875, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.2209923416376114, "epoch": 1.9310897435897436, "frac_reward_zero_std": 0.59375, "grad_norm": 0.022249111905694008, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 877274857.0, "reward": 0.3205749988555908, "reward_std": 0.028882907703518867, "rewards/progression_diversity/mean": -0.00011996681860182434, "rewards/progression_diversity/std": 0.0014987426111474633, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.7170247435569763, "rewards/symbolic_reward_partial_score/std": 0.19960245490074158, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0480538606643677, "sampling/importance_sampling_ratio/min": 6.5602007333354084e-12, "sampling/sampling_logp_difference/max": 25.75, "sampling/sampling_logp_difference/mean": 0.09933799505233765, "step": 1205 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.21989618986845016, "epoch": 1.9326923076923077, "grad_norm": 0.008175384253263474, "learning_rate": 1e-06, "loss": 0.0013, "step": 1206 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.22836829721927643, "epoch": 1.9342948717948718, "grad_norm": 0.016739025712013245, "learning_rate": 1e-06, "loss": -0.0014, "step": 1207 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2236020788550377, "epoch": 1.935897435897436, "grad_norm": 0.013731605373322964, "learning_rate": 1e-06, "loss": 0.0002, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3453.0, "completions/mean_length": 1140.861328125, "completions/mean_terminated_length": 1111.03125, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "entropy": 0.2177734076976776, "epoch": 1.9375, "frac_reward_zero_std": 0.40625, "grad_norm": 574.372314453125, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 878788738.0, "reward": 0.20117032527923584, "reward_std": 0.02102627232670784, "rewards/progression_diversity/mean": -0.00015734444605186582, "rewards/progression_diversity/std": 0.002003313507884741, "rewards/symbolic_reward_accuracy/mean": 0.0234375, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.6243489980697632, "rewards/symbolic_reward_partial_score/std": 0.15533775091171265, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0462570190429688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.6281495690345764, "step": 1209 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.22087281942367554, "epoch": 1.939102564102564, "grad_norm": 0.0154855502769351, "learning_rate": 1e-06, "loss": 0.0046, "step": 1210 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22198112308979034, "epoch": 1.9407051282051282, "grad_norm": 0.018509745597839355, "learning_rate": 1e-06, "loss": -0.0006, "step": 1211 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22084571421146393, "epoch": 1.9423076923076923, "grad_norm": 0.021920692175626755, "learning_rate": 1e-06, "loss": -0.0027, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3133.0, "completions/mean_length": 1260.185546875, "completions/mean_terminated_length": 1200.8765869140625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.22076184302568436, "epoch": 1.9439102564102564, "frac_reward_zero_std": 0.46875, "grad_norm": 0.016443662345409393, "learning_rate": 1e-06, "loss": -0.0056, "num_tokens": 880305873.0, "reward": 0.2951447665691376, "reward_std": 0.037818461656570435, "rewards/progression_diversity/mean": -0.00017372408183291554, "rewards/progression_diversity/std": 0.002410082146525383, "rewards/symbolic_reward_accuracy/mean": 0.1484375, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.6882486939430237, "rewards/symbolic_reward_partial_score/std": 0.20156751573085785, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0463939905166626, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.4395545721054077, "step": 1213 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22432439774274826, "epoch": 1.9455128205128205, "grad_norm": 752.1104125976562, "learning_rate": 1e-06, "loss": 0.0734, "step": 1214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.22369728982448578, "epoch": 1.9471153846153846, "grad_norm": 0.005198925733566284, "learning_rate": 1e-06, "loss": 0.0301, "step": 1215 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22393931448459625, "epoch": 1.9487179487179487, "grad_norm": 0.01498700212687254, "learning_rate": 1e-06, "loss": 0.0054, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2988.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 1089.60546875, "completions/mean_terminated_length": 1089.60546875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.2190534770488739, "epoch": 1.9503205128205128, "frac_reward_zero_std": 0.375, "grad_norm": 0.02252928353846073, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 881729991.0, "reward": 0.28866058588027954, "reward_std": 0.05070298910140991, "rewards/progression_diversity/mean": -0.00015254000027198344, "rewards/progression_diversity/std": 0.0022601871751248837, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.7278319597244263, "rewards/symbolic_reward_partial_score/std": 0.17303742468357086, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0459058284759521, "sampling/importance_sampling_ratio/min": 2.218274985921198e-08, "sampling/sampling_logp_difference/max": 17.623950958251953, "sampling/sampling_logp_difference/mean": 0.09728400409221649, "step": 1217 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21607843786478043, "epoch": 1.9519230769230769, "grad_norm": 0.030551020056009293, "learning_rate": 1e-06, "loss": -0.0004, "step": 1218 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2140772044658661, "epoch": 1.953525641025641, "grad_norm": 0.009000630117952824, "learning_rate": 1e-06, "loss": 0.0062, "step": 1219 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.21863539516925812, "epoch": 1.9551282051282053, "grad_norm": 0.01365700364112854, "learning_rate": 1e-06, "loss": 0.0039, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2953.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 1145.857421875, "completions/mean_terminated_length": 1145.857421875, "completions/min_length": 401.0, "completions/min_terminated_length": 401.0, "entropy": 0.2172141596674919, "epoch": 1.9567307692307692, "frac_reward_zero_std": 0.5, "grad_norm": 0.024405311793088913, "learning_rate": 1e-06, "loss": -0.0044, "num_tokens": 883216462.0, "reward": 0.3235936760902405, "reward_std": 0.029494676738977432, "rewards/progression_diversity/mean": -8.07223841547966e-06, "rewards/progression_diversity/std": 0.00018265389371663332, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.7192708253860474, "rewards/symbolic_reward_partial_score/std": 0.19232387840747833, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046884298324585, "sampling/importance_sampling_ratio/min": 1.05600235755735e-20, "sampling/sampling_logp_difference/max": 45.99721145629883, "sampling/sampling_logp_difference/mean": 0.09837117791175842, "step": 1221 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.22011933475732803, "epoch": 1.9583333333333335, "grad_norm": 0.00895681418478489, "learning_rate": 1e-06, "loss": 0.0046, "step": 1222 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.21496566385030746, "epoch": 1.9599358974358974, "grad_norm": 0.014537299983203411, "learning_rate": 1e-06, "loss": 0.0008, "step": 1223 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.21989809721708298, "epoch": 1.9615384615384617, "grad_norm": 0.008813065476715565, "learning_rate": 1e-06, "loss": 0.0009, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3093.0, "completions/max_terminated_length": 3093.0, "completions/mean_length": 1287.013671875, "completions/mean_terminated_length": 1287.013671875, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.21527937054634094, "epoch": 1.9631410256410255, "frac_reward_zero_std": 0.4375, "grad_norm": 0.017107542604207993, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 884739141.0, "reward": 0.28098633885383606, "reward_std": 0.0465182401239872, "rewards/progression_diversity/mean": -1.3702083379030228e-06, "rewards/progression_diversity/std": 3.1004274205770344e-05, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.6866210699081421, "rewards/symbolic_reward_partial_score/std": 0.21217504143714905, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0459423065185547, "sampling/importance_sampling_ratio/min": 9.484894411184965e-19, "sampling/sampling_logp_difference/max": 41.49941635131836, "sampling/sampling_logp_difference/mean": 0.0953700914978981, "step": 1225 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21589966118335724, "epoch": 1.9647435897435899, "grad_norm": 0.01544854324311018, "learning_rate": 1e-06, "loss": -0.0047, "step": 1226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.21224521100521088, "epoch": 1.9663461538461537, "grad_norm": 0.016370629891753197, "learning_rate": 1e-06, "loss": -0.0017, "step": 1227 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.21077106148004532, "epoch": 1.967948717948718, "grad_norm": 0.009457322768867016, "learning_rate": 1e-06, "loss": 0.0047, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3146.0, "completions/max_terminated_length": 3146.0, "completions/mean_length": 1206.787109375, "completions/mean_terminated_length": 1206.787109375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "entropy": 0.21696603298187256, "epoch": 1.969551282051282, "frac_reward_zero_std": 0.5625, "grad_norm": 0.01737239398062229, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 886241192.0, "reward": 0.41241925954818726, "reward_std": 0.028087418526411057, "rewards/progression_diversity/mean": -0.00026385916862636805, "rewards/progression_diversity/std": 0.002647166606038809, "rewards/symbolic_reward_accuracy/mean": 0.322265625, "rewards/symbolic_reward_accuracy/std": 0.46780112385749817, "rewards/symbolic_reward_partial_score/mean": 0.7302083969116211, "rewards/symbolic_reward_partial_score/std": 0.22925032675266266, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0460162162780762, "sampling/importance_sampling_ratio/min": 0.00035160055267624557, "sampling/sampling_logp_difference/max": 7.953014850616455, "sampling/sampling_logp_difference/mean": 0.09595166146755219, "step": 1229 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.21299895644187927, "epoch": 1.9711538461538463, "grad_norm": 0.013916675001382828, "learning_rate": 1e-06, "loss": 0.0016, "step": 1230 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2122596800327301, "epoch": 1.9727564102564101, "grad_norm": 0.013204848393797874, "learning_rate": 1e-06, "loss": 0.001, "step": 1231 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2141348496079445, "epoch": 1.9743589743589745, "grad_norm": 0.015932990238070488, "learning_rate": 1e-06, "loss": -0.0001, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3633.0, "completions/mean_length": 1235.24609375, "completions/mean_terminated_length": 1205.600830078125, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.21588806062936783, "epoch": 1.9759615384615383, "frac_reward_zero_std": 0.34375, "grad_norm": 0.016500849276781082, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 887740710.0, "reward": 0.3222983777523041, "reward_std": 0.06374066323041916, "rewards/progression_diversity/mean": -0.00014570857456419617, "rewards/progression_diversity/std": 0.0015578385209664702, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.6921712160110474, "rewards/symbolic_reward_partial_score/std": 0.2026739865541458, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045090913772583, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.3644917607307434, "step": 1233 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.21097960323095322, "epoch": 1.9775641025641026, "grad_norm": 0.0278862826526165, "learning_rate": 1e-06, "loss": 0.0195, "step": 1234 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2159006968140602, "epoch": 1.9791666666666665, "grad_norm": 0.014327644370496273, "learning_rate": 1e-06, "loss": 0.0054, "step": 1235 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2141767367720604, "epoch": 1.9807692307692308, "grad_norm": 0.017790069803595543, "learning_rate": 1e-06, "loss": -0.0013, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3288.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 1284.09375, "completions/mean_terminated_length": 1284.09375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.210280142724514, "epoch": 1.9823717948717947, "frac_reward_zero_std": 0.28125, "grad_norm": 0.021001746878027916, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 889325798.0, "reward": 0.2985340356826782, "reward_std": 0.05770270898938179, "rewards/progression_diversity/mean": -0.00011500486289151013, "rewards/progression_diversity/std": 0.0018274638568982482, "rewards/symbolic_reward_accuracy/mean": 0.142578125, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.7099609375, "rewards/symbolic_reward_partial_score/std": 0.19229154288768768, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0444259643554688, "sampling/importance_sampling_ratio/min": 6.935766052899004e-22, "sampling/sampling_logp_difference/max": 48.72018051147461, "sampling/sampling_logp_difference/mean": 0.09380759298801422, "step": 1237 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.20352954417467117, "epoch": 1.983974358974359, "grad_norm": 0.02166086994111538, "learning_rate": 1e-06, "loss": -0.0037, "step": 1238 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.20602308958768845, "epoch": 1.9855769230769231, "grad_norm": 0.017689600586891174, "learning_rate": 1e-06, "loss": 0.01, "step": 1239 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.20504777133464813, "epoch": 1.9871794871794872, "grad_norm": 0.01678495481610298, "learning_rate": 1e-06, "loss": -0.0135, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3242.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 1257.923828125, "completions/mean_terminated_length": 1257.923828125, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.20558489859104156, "epoch": 1.9887820512820513, "frac_reward_zero_std": 0.40625, "grad_norm": 0.024906298145651817, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 890897599.0, "reward": 0.18425101041793823, "reward_std": 0.02554313838481903, "rewards/progression_diversity/mean": -0.00019268083269707859, "rewards/progression_diversity/std": 0.0028432749677449465, "rewards/symbolic_reward_accuracy/mean": 0.005859375, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.6024577021598816, "rewards/symbolic_reward_partial_score/std": 0.18746401369571686, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0438592433929443, "sampling/importance_sampling_ratio/min": 0.0005599469877779484, "sampling/sampling_logp_difference/max": 7.487668514251709, "sampling/sampling_logp_difference/mean": 0.09330444037914276, "step": 1241 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.20575018227100372, "epoch": 1.9903846153846154, "grad_norm": 0.01447804644703865, "learning_rate": 1e-06, "loss": 0.0037, "step": 1242 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.208658367395401, "epoch": 1.9919871794871795, "grad_norm": 0.026018286123871803, "learning_rate": 1e-06, "loss": 0.0001, "step": 1243 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.20609137415885925, "epoch": 1.9935897435897436, "grad_norm": 0.015114527195692062, "learning_rate": 1e-06, "loss": -0.0032, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 1268.546875, "completions/mean_terminated_length": 1238.9666748046875, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "entropy": 0.19605816900730133, "epoch": 1.9951923076923077, "frac_reward_zero_std": 0.5, "grad_norm": 0.026206906884908676, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 892442183.0, "reward": 0.3488447368144989, "reward_std": 0.04794444516301155, "rewards/progression_diversity/mean": -0.000291308737359941, "rewards/progression_diversity/std": 0.0030992666725069284, "rewards/symbolic_reward_accuracy/mean": 0.2265625, "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, "rewards/symbolic_reward_partial_score/mean": 0.7097005248069763, "rewards/symbolic_reward_partial_score/std": 0.21007077395915985, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0414423942565918, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.47119155526161194, "step": 1245 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.19719059765338898, "epoch": 1.9967948717948718, "grad_norm": 0.014178547076880932, "learning_rate": 1e-06, "loss": -0.0034, "step": 1246 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.19025534391403198, "epoch": 1.998397435897436, "grad_norm": 0.00976389180868864, "learning_rate": 1e-06, "loss": 0.0362, "step": 1247 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.19911998510360718, "epoch": 2.0, "grad_norm": 0.021017316728830338, "learning_rate": 1e-06, "loss": 0.0021, "step": 1248 }, { "epoch": 2.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.00244140625, "eval_completions/max_length": 6023.21875, "eval_completions/max_terminated_length": 3145.0, "eval_completions/mean_length": 1369.302734375, "eval_completions/mean_terminated_length": 1332.704662322998, "eval_completions/min_length": 460.25, "eval_completions/min_terminated_length": 460.25, "eval_entropy": 0.19390225457027555, "eval_frac_reward_zero_std": 0.28125, "eval_loss": 0.00268191983923316, "eval_num_tokens": 892442183.0, "eval_reward": 0.23870036005973816, "eval_reward_std": 0.03971031281980686, "eval_rewards/progression_diversity/mean": -0.0003259587915636075, "eval_rewards/progression_diversity/std": 0.002640488281031139, "eval_rewards/symbolic_reward_accuracy/mean": 0.079345703125, "eval_rewards/symbolic_reward_accuracy/std": 0.19223300064913929, "eval_rewards/symbolic_reward_partial_score/mean": 0.6377197187393904, "eval_rewards/symbolic_reward_partial_score/std": 0.1931128588039428, "eval_rewards/tag_count_reward/mean": -0.002197265625, "eval_rewards/tag_count_reward/std": 0.019685723586007953, "eval_runtime": 1580.0116, "eval_samples_per_second": 0.158, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0408074893057346, "eval_sampling/importance_sampling_ratio/min": 0.00181254785630403, "eval_sampling/sampling_logp_difference/max": 151.67179602384567, "eval_sampling/sampling_logp_difference/mean": 0.2478228227701038, "eval_steps_per_second": 0.001, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3355.0, "completions/mean_length": 1373.986328125, "completions/mean_terminated_length": 1315.1236572265625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.20005454123020172, "epoch": 2.0016025641025643, "frac_reward_zero_std": 0.15625, "grad_norm": 233.30934143066406, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 893940272.0, "reward": 0.36688268184661865, "reward_std": 0.056143928319215775, "rewards/progression_diversity/mean": -0.001186927082017064, "rewards/progression_diversity/std": 0.01717980019748211, "rewards/symbolic_reward_accuracy/mean": 0.240234375, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.7431640625, "rewards/symbolic_reward_partial_score/std": 0.20910829305648804, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0366449356079102, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 2.187889575958252, "step": 1249 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.19553004205226898, "epoch": 2.003205128205128, "grad_norm": 0.019861171022057533, "learning_rate": 1e-06, "loss": -0.0016, "step": 1250 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.19362886995077133, "epoch": 2.0048076923076925, "grad_norm": 0.02055576629936695, "learning_rate": 1e-06, "loss": 0.0059, "step": 1251 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.19335567951202393, "epoch": 2.0064102564102564, "grad_norm": 0.017372747883200645, "learning_rate": 1e-06, "loss": 0.0263, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3457.0, "completions/mean_length": 1377.3046875, "completions/mean_terminated_length": 1347.9373779296875, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.1914682313799858, "epoch": 2.0080128205128207, "frac_reward_zero_std": 0.28125, "grad_norm": 0.020828014239668846, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 895523628.0, "reward": 0.28705087304115295, "reward_std": 0.06678985804319382, "rewards/progression_diversity/mean": -0.00047905254177749157, "rewards/progression_diversity/std": 0.005628896411508322, "rewards/symbolic_reward_accuracy/mean": 0.13671875, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.6840657591819763, "rewards/symbolic_reward_partial_score/std": 0.20577622950077057, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0396616458892822, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.37476086616516113, "step": 1253 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.19290538132190704, "epoch": 2.0096153846153846, "grad_norm": 1153.7135009765625, "learning_rate": 1e-06, "loss": 0.144, "step": 1254 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.19067180156707764, "epoch": 2.011217948717949, "grad_norm": 0.014077689498662949, "learning_rate": 1e-06, "loss": -0.0025, "step": 1255 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.19498515874147415, "epoch": 2.0128205128205128, "grad_norm": 0.017978975549340248, "learning_rate": 1e-06, "loss": 0.0053, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3308.0, "completions/mean_length": 1435.3828125, "completions/mean_terminated_length": 1376.7608642578125, "completions/min_length": 418.0, "completions/min_terminated_length": 418.0, "entropy": 0.19777005165815353, "epoch": 2.014423076923077, "frac_reward_zero_std": 0.125, "grad_norm": 0.028694914653897285, "learning_rate": 1e-06, "loss": 0.0114, "num_tokens": 897164496.0, "reward": 0.3483375608921051, "reward_std": 0.07038730382919312, "rewards/progression_diversity/mean": -0.0007183193229138851, "rewards/progression_diversity/std": 0.0077925813384354115, "rewards/symbolic_reward_accuracy/mean": 0.232421875, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.6976073980331421, "rewards/symbolic_reward_partial_score/std": 0.23062647879123688, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0391186475753784, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.5552532076835632, "step": 1257 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.19494935125112534, "epoch": 2.016025641025641, "grad_norm": 0.016217295080423355, "learning_rate": 1e-06, "loss": 0.0099, "step": 1258 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.1985015571117401, "epoch": 2.0176282051282053, "grad_norm": 0.02569577842950821, "learning_rate": 1e-06, "loss": 0.0176, "step": 1259 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.1980082392692566, "epoch": 2.019230769230769, "grad_norm": 0.01939314603805542, "learning_rate": 1e-06, "loss": 0.0001, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3630.0, "completions/mean_length": 1615.064453125, "completions/mean_terminated_length": 1469.4141845703125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.18866483867168427, "epoch": 2.0208333333333335, "frac_reward_zero_std": 0.375, "grad_norm": 20.04486656188965, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 898821809.0, "reward": 0.36717134714126587, "reward_std": 0.06693032383918762, "rewards/progression_diversity/mean": -0.0011277215089648962, "rewards/progression_diversity/std": 0.00911035481840372, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.7389160394668579, "rewards/symbolic_reward_partial_score/std": 0.21004585921764374, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0375559329986572, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.6628260612487793, "step": 1261 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.18701110780239105, "epoch": 2.0224358974358974, "grad_norm": 0.018055502325296402, "learning_rate": 1e-06, "loss": 0.02, "step": 1262 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.18735942244529724, "epoch": 2.0240384615384617, "grad_norm": 0.02284780703485012, "learning_rate": 1e-06, "loss": 0.0494, "step": 1263 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.1921609714627266, "epoch": 2.0256410256410255, "grad_norm": 0.03690061345696449, "learning_rate": 1e-06, "loss": 0.0227, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 1423.083984375, "completions/mean_terminated_length": 1364.413818359375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.1979808807373047, "epoch": 2.02724358974359, "frac_reward_zero_std": 0.40625, "grad_norm": 0.024405112490057945, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 900394908.0, "reward": 0.3067031800746918, "reward_std": 0.04862578958272934, "rewards/progression_diversity/mean": -9.318315278505906e-05, "rewards/progression_diversity/std": 0.001644739182665944, "rewards/symbolic_reward_accuracy/mean": 0.14453125, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.7345865964889526, "rewards/symbolic_reward_partial_score/std": 0.20190957188606262, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0397248268127441, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.6828402876853943, "step": 1265 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.19661108404397964, "epoch": 2.0288461538461537, "grad_norm": 0.013450069352984428, "learning_rate": 1e-06, "loss": 0.054, "step": 1266 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.19856248795986176, "epoch": 2.030448717948718, "grad_norm": 0.017992835491895676, "learning_rate": 1e-06, "loss": -0.0069, "step": 1267 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.19879474490880966, "epoch": 2.032051282051282, "grad_norm": 0.019288472831249237, "learning_rate": 1e-06, "loss": -0.0147, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 9273.0, "completions/mean_length": 1691.78515625, "completions/mean_terminated_length": 1546.8914794921875, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.19042234122753143, "epoch": 2.0336538461538463, "frac_reward_zero_std": 0.4375, "grad_norm": 47.59299087524414, "learning_rate": 1e-06, "loss": 0.0107, "num_tokens": 902148702.0, "reward": 0.3654331862926483, "reward_std": 0.061046987771987915, "rewards/progression_diversity/mean": -0.0011158745037391782, "rewards/progression_diversity/std": 0.014414801262319088, "rewards/symbolic_reward_accuracy/mean": 0.232421875, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.7565592527389526, "rewards/symbolic_reward_partial_score/std": 0.20248234272003174, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0370126962661743, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 1.0821824073791504, "step": 1269 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.18952977657318115, "epoch": 2.03525641025641, "grad_norm": 0.0266401544213295, "learning_rate": 1e-06, "loss": 0.1359, "step": 1270 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.18822447210550308, "epoch": 2.0368589743589745, "grad_norm": 0.027362341061234474, "learning_rate": 1e-06, "loss": 0.0679, "step": 1271 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.18995289504528046, "epoch": 2.0384615384615383, "grad_norm": 0.014087039045989513, "learning_rate": 1e-06, "loss": 0.0075, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3306.0, "completions/mean_length": 1566.1796875, "completions/mean_terminated_length": 1420.04736328125, "completions/min_length": 428.0, "completions/min_terminated_length": 428.0, "entropy": 0.19420960545539856, "epoch": 2.0400641025641026, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01688203774392605, "learning_rate": 1e-06, "loss": 0.0064, "num_tokens": 903831514.0, "reward": 0.36713409423828125, "reward_std": 0.03999203443527222, "rewards/progression_diversity/mean": -0.00046001013834029436, "rewards/progression_diversity/std": 0.004506079014390707, "rewards/symbolic_reward_accuracy/mean": 0.240234375, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.74658203125, "rewards/symbolic_reward_partial_score/std": 0.20671479403972626, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0370961427688599, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.4572553634643555, "step": 1273 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.1954876109957695, "epoch": 2.0416666666666665, "grad_norm": 0.018446512520313263, "learning_rate": 1e-06, "loss": 0.0048, "step": 1274 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.1945226937532425, "epoch": 2.043269230769231, "grad_norm": 324.0447082519531, "learning_rate": 1e-06, "loss": 0.0717, "step": 1275 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.19547558575868607, "epoch": 2.0448717948717947, "grad_norm": 755.1400756835938, "learning_rate": 1e-06, "loss": 0.0474, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3276.0, "completions/mean_length": 1617.3359375, "completions/mean_terminated_length": 1353.1212158203125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.19297361373901367, "epoch": 2.046474358974359, "frac_reward_zero_std": 0.34375, "grad_norm": 0.018968136981129646, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 905556918.0, "reward": 0.2657776474952698, "reward_std": 0.0448925644159317, "rewards/progression_diversity/mean": -0.00036140886368229985, "rewards/progression_diversity/std": 0.004768828861415386, "rewards/symbolic_reward_accuracy/mean": 0.115234375, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.6613280773162842, "rewards/symbolic_reward_partial_score/std": 0.21297021210193634, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0336802005767822, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 2.5898587703704834, "step": 1277 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.18827137351036072, "epoch": 2.048076923076923, "grad_norm": 0.011503035202622414, "learning_rate": 1e-06, "loss": 0.0706, "step": 1278 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.19312509894371033, "epoch": 2.0496794871794872, "grad_norm": 0.009355909191071987, "learning_rate": 1e-06, "loss": 0.0425, "step": 1279 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.18875424563884735, "epoch": 2.051282051282051, "grad_norm": 0.016550183296203613, "learning_rate": 1e-06, "loss": 0.0553, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 1660.9140625, "completions/mean_terminated_length": 1456.831787109375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.19483474642038345, "epoch": 2.0528846153846154, "frac_reward_zero_std": 0.3125, "grad_norm": 0.02122480981051922, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 907271450.0, "reward": 0.33715343475341797, "reward_std": 0.034450314939022064, "rewards/progression_diversity/mean": -0.00047867521061562, "rewards/progression_diversity/std": 0.004741494078189135, "rewards/symbolic_reward_accuracy/mean": 0.216796875, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.69482421875, "rewards/symbolic_reward_partial_score/std": 0.21679222583770752, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0364654064178467, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.3800773620605469, "step": 1281 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.1928672194480896, "epoch": 2.0544871794871793, "grad_norm": 1.5037897825241089, "learning_rate": 1e-06, "loss": 0.0278, "step": 1282 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.1988951489329338, "epoch": 2.0560897435897436, "grad_norm": 0.024000803008675575, "learning_rate": 1e-06, "loss": 0.0196, "step": 1283 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.19203483313322067, "epoch": 2.0576923076923075, "grad_norm": 0.019834544509649277, "learning_rate": 1e-06, "loss": 0.0836, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5436.0, "completions/mean_length": 1464.345703125, "completions/mean_terminated_length": 1405.8372802734375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "entropy": 0.20045289397239685, "epoch": 2.059294871794872, "frac_reward_zero_std": 0.40625, "grad_norm": 28.706775665283203, "learning_rate": 1e-06, "loss": 0.0216, "num_tokens": 908900203.0, "reward": 0.30004429817199707, "reward_std": 0.05181866139173508, "rewards/progression_diversity/mean": -0.0009444843744859099, "rewards/progression_diversity/std": 0.011486358940601349, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.6928874254226685, "rewards/symbolic_reward_partial_score/std": 0.2050057202577591, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0416698455810547, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.41028910875320435, "step": 1285 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2051420360803604, "epoch": 2.0608974358974357, "grad_norm": 0.01245288085192442, "learning_rate": 1e-06, "loss": -0.0077, "step": 1286 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2012268602848053, "epoch": 2.0625, "grad_norm": 0.025453390553593636, "learning_rate": 1e-06, "loss": 0.0173, "step": 1287 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.20480038225650787, "epoch": 2.064102564102564, "grad_norm": 0.025336798280477524, "learning_rate": 1e-06, "loss": 0.0103, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3935.0, "completions/mean_length": 1418.521484375, "completions/mean_terminated_length": 1389.23486328125, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.2055332362651825, "epoch": 2.065705128205128, "frac_reward_zero_std": 0.5, "grad_norm": 157.3429412841797, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 910496934.0, "reward": 0.26742783188819885, "reward_std": 0.019551731646060944, "rewards/progression_diversity/mean": -0.0008687502122484148, "rewards/progression_diversity/std": 0.010150086134672165, "rewards/symbolic_reward_accuracy/mean": 0.09375, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.7046061158180237, "rewards/symbolic_reward_partial_score/std": 0.19600564241409302, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0424489974975586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.4152454733848572, "step": 1289 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2030770629644394, "epoch": 2.0673076923076925, "grad_norm": 0.015944253653287888, "learning_rate": 1e-06, "loss": -0.0017, "step": 1290 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.20718741416931152, "epoch": 2.0689102564102564, "grad_norm": 0.014883228577673435, "learning_rate": 1e-06, "loss": 0.0054, "step": 1291 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.20744304358959198, "epoch": 2.0705128205128207, "grad_norm": 0.014635228551924229, "learning_rate": 1e-06, "loss": -0.0017, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 1773.353515625, "completions/mean_terminated_length": 1600.1048583984375, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.18955910950899124, "epoch": 2.0721153846153846, "frac_reward_zero_std": 0.375, "grad_norm": 1073.1669921875, "learning_rate": 1e-06, "loss": 0.058, "num_tokens": 912368443.0, "reward": 0.2491791993379593, "reward_std": 0.03544043377041817, "rewards/progression_diversity/mean": -0.0005382450763136148, "rewards/progression_diversity/std": 0.008408893831074238, "rewards/symbolic_reward_accuracy/mean": 0.08203125, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.670458972454071, "rewards/symbolic_reward_partial_score/std": 0.19899418950080872, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.036078691482544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.4867057800292969, "step": 1293 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.19430772960186005, "epoch": 2.073717948717949, "grad_norm": 0.013100269250571728, "learning_rate": 1e-06, "loss": -0.0115, "step": 1294 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.19107140600681305, "epoch": 2.0753205128205128, "grad_norm": 0.012087260372936726, "learning_rate": 1e-06, "loss": 0.0252, "step": 1295 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.1892467811703682, "epoch": 2.076923076923077, "grad_norm": 0.01232385728508234, "learning_rate": 1e-06, "loss": 0.0455, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3224.0, "completions/mean_length": 1561.599609375, "completions/mean_terminated_length": 1444.8878173828125, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.1971185952425003, "epoch": 2.078525641025641, "frac_reward_zero_std": 0.34375, "grad_norm": 33.13689422607422, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 914118782.0, "reward": 0.27982568740844727, "reward_std": 0.04554954543709755, "rewards/progression_diversity/mean": -0.00034400858567096293, "rewards/progression_diversity/std": 0.0038654166273772717, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.6931803226470947, "rewards/symbolic_reward_partial_score/std": 0.18553432822227478, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.037705659866333, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 732.0, "sampling/sampling_logp_difference/mean": 1.3978204727172852, "step": 1297 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.1944933459162712, "epoch": 2.0801282051282053, "grad_norm": 0.015022831037640572, "learning_rate": 1e-06, "loss": 0.0251, "step": 1298 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.19650224596261978, "epoch": 2.081730769230769, "grad_norm": 0.022843046113848686, "learning_rate": 1e-06, "loss": -0.0049, "step": 1299 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.19136834889650345, "epoch": 2.0833333333333335, "grad_norm": 0.02248123101890087, "learning_rate": 1e-06, "loss": 0.0554, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3264.0, "completions/mean_length": 1438.8671875, "completions/mean_terminated_length": 1409.620361328125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.2071860283613205, "epoch": 2.0849358974358974, "frac_reward_zero_std": 0.375, "grad_norm": 0.022805871441960335, "learning_rate": 1e-06, "loss": 0.0063, "num_tokens": 915655274.0, "reward": 0.4092583954334259, "reward_std": 0.04942026361823082, "rewards/progression_diversity/mean": -0.0004308145144023001, "rewards/progression_diversity/std": 0.00391729548573494, "rewards/symbolic_reward_accuracy/mean": 0.302734375, "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, "rewards/symbolic_reward_partial_score/mean": 0.7593912482261658, "rewards/symbolic_reward_partial_score/std": 0.2201661616563797, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0432233810424805, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.34774258732795715, "step": 1301 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.20583676546812057, "epoch": 2.0865384615384617, "grad_norm": 0.010654095560312271, "learning_rate": 1e-06, "loss": -0.0036, "step": 1302 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.21007215976715088, "epoch": 2.0881410256410255, "grad_norm": 0.018026195466518402, "learning_rate": 1e-06, "loss": -0.002, "step": 1303 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.20896370708942413, "epoch": 2.08974358974359, "grad_norm": 0.018298866227269173, "learning_rate": 1e-06, "loss": 0.0288, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 1537.986328125, "completions/mean_terminated_length": 1450.4853515625, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "entropy": 0.19623306393623352, "epoch": 2.0913461538461537, "frac_reward_zero_std": 0.25, "grad_norm": 455.7239685058594, "learning_rate": 1e-06, "loss": 0.0324, "num_tokens": 917384147.0, "reward": 0.27777591347694397, "reward_std": 0.05723512917757034, "rewards/progression_diversity/mean": -0.00024088792270049453, "rewards/progression_diversity/std": 0.0034442872274667025, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.6778808832168579, "rewards/symbolic_reward_partial_score/std": 0.21165545284748077, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.040327787399292, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.7170519232749939, "step": 1305 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.20050545781850815, "epoch": 2.092948717948718, "grad_norm": 0.024372782558202744, "learning_rate": 1e-06, "loss": 0.0046, "step": 1306 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.20225122570991516, "epoch": 2.094551282051282, "grad_norm": 0.017451845109462738, "learning_rate": 1e-06, "loss": -0.0046, "step": 1307 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.19906309247016907, "epoch": 2.0961538461538463, "grad_norm": 0.015806974843144417, "learning_rate": 1e-06, "loss": 0.0063, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 1569.970703125, "completions/mean_terminated_length": 1569.970703125, "completions/min_length": 474.0, "completions/min_terminated_length": 474.0, "entropy": 0.2078276202082634, "epoch": 2.09775641025641, "frac_reward_zero_std": 0.34375, "grad_norm": 0.030513443052768707, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 918996132.0, "reward": 0.3342166543006897, "reward_std": 0.05483569577336311, "rewards/progression_diversity/mean": -0.00021007427130825818, "rewards/progression_diversity/std": 0.001884151715785265, "rewards/symbolic_reward_accuracy/mean": 0.203125, "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, "rewards/symbolic_reward_partial_score/mean": 0.7078125476837158, "rewards/symbolic_reward_partial_score/std": 0.2104804515838623, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0436408519744873, "sampling/importance_sampling_ratio/min": 1.435761191714846e-06, "sampling/sampling_logp_difference/max": 13.453815460205078, "sampling/sampling_logp_difference/mean": 0.09233014285564423, "step": 1309 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.20950071513652802, "epoch": 2.0993589743589745, "grad_norm": 0.01937035657465458, "learning_rate": 1e-06, "loss": -0.0061, "step": 1310 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21072106808423996, "epoch": 2.1009615384615383, "grad_norm": 0.019583554938435555, "learning_rate": 1e-06, "loss": -0.0035, "step": 1311 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.20755978673696518, "epoch": 2.1025641025641026, "grad_norm": 0.017628004774451256, "learning_rate": 1e-06, "loss": 0.013, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2935.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 1462.236328125, "completions/mean_terminated_length": 1462.236328125, "completions/min_length": 465.0, "completions/min_terminated_length": 465.0, "entropy": 0.20946191251277924, "epoch": 2.1041666666666665, "frac_reward_zero_std": 0.3125, "grad_norm": 0.016548819839954376, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 920573229.0, "reward": 0.3093579113483429, "reward_std": 0.030774272978305817, "rewards/progression_diversity/mean": -0.0002441601827740669, "rewards/progression_diversity/std": 0.0027843969874083996, "rewards/symbolic_reward_accuracy/mean": 0.146484375, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.7382323741912842, "rewards/symbolic_reward_partial_score/std": 0.1788843870162964, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044257640838623, "sampling/importance_sampling_ratio/min": 0.00013327680062502623, "sampling/sampling_logp_difference/max": 8.92308235168457, "sampling/sampling_logp_difference/mean": 0.09280422329902649, "step": 1313 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2082810178399086, "epoch": 2.105769230769231, "grad_norm": 0.018022574484348297, "learning_rate": 1e-06, "loss": 0.0031, "step": 1314 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.20982997119426727, "epoch": 2.1073717948717947, "grad_norm": 0.01984243281185627, "learning_rate": 1e-06, "loss": -0.004, "step": 1315 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.21134591847658157, "epoch": 2.108974358974359, "grad_norm": 0.025971846655011177, "learning_rate": 1e-06, "loss": 0.0064, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3383.0, "completions/mean_length": 1503.8203125, "completions/mean_terminated_length": 1474.7005615234375, "completions/min_length": 454.0, "completions/min_terminated_length": 454.0, "entropy": 0.20815780013799667, "epoch": 2.110576923076923, "frac_reward_zero_std": 0.21875, "grad_norm": 0.027938535436987877, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 922209697.0, "reward": 0.3024898171424866, "reward_std": 0.08157630264759064, "rewards/progression_diversity/mean": -0.0005339247290976346, "rewards/progression_diversity/std": 0.009711179882287979, "rewards/symbolic_reward_accuracy/mean": 0.162109375, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.6847493648529053, "rewards/symbolic_reward_partial_score/std": 0.21822915971279144, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0429041385650635, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.4195671081542969, "step": 1317 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21150559186935425, "epoch": 2.1121794871794872, "grad_norm": 0.0272557120770216, "learning_rate": 1e-06, "loss": -0.0185, "step": 1318 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.20635531842708588, "epoch": 2.113782051282051, "grad_norm": 0.0219330545514822, "learning_rate": 1e-06, "loss": 0.0242, "step": 1319 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.20883511006832123, "epoch": 2.1153846153846154, "grad_norm": 344.6760559082031, "learning_rate": 1e-06, "loss": 0.0317, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3218.0, "completions/mean_length": 1496.935546875, "completions/mean_terminated_length": 1467.8023681640625, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.20407634973526, "epoch": 2.1169871794871793, "frac_reward_zero_std": 0.40625, "grad_norm": 0.02108251117169857, "learning_rate": 1e-06, "loss": 0.0286, "num_tokens": 923895312.0, "reward": 0.36836761236190796, "reward_std": 0.025770656764507294, "rewards/progression_diversity/mean": -0.0001522178645245731, "rewards/progression_diversity/std": 0.0016012955456972122, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.7357096076011658, "rewards/symbolic_reward_partial_score/std": 0.22949159145355225, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0432062149047852, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 728.0, "sampling/sampling_logp_difference/mean": 0.1866808831691742, "step": 1321 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.20261122286319733, "epoch": 2.1185897435897436, "grad_norm": 0.024469273164868355, "learning_rate": 1e-06, "loss": 0.006, "step": 1322 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.20310993492603302, "epoch": 2.1201923076923075, "grad_norm": 0.023724382743239403, "learning_rate": 1e-06, "loss": -0.0056, "step": 1323 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.20938430726528168, "epoch": 2.121794871794872, "grad_norm": 0.012969693168997765, "learning_rate": 1e-06, "loss": 0.0018, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 1410.677734375, "completions/mean_terminated_length": 1410.677734375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "entropy": 0.2097427099943161, "epoch": 2.123397435897436, "frac_reward_zero_std": 0.375, "grad_norm": 0.01685580424964428, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 925466827.0, "reward": 0.38571590185165405, "reward_std": 0.03916021063923836, "rewards/progression_diversity/mean": -0.00018581100448500365, "rewards/progression_diversity/std": 0.002618141006678343, "rewards/symbolic_reward_accuracy/mean": 0.255859375, "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, "rewards/symbolic_reward_partial_score/mean": 0.7740071415901184, "rewards/symbolic_reward_partial_score/std": 0.18180802464485168, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450756549835205, "sampling/importance_sampling_ratio/min": 1.69496820490167e-06, "sampling/sampling_logp_difference/max": 13.287846565246582, "sampling/sampling_logp_difference/mean": 0.0947427749633789, "step": 1325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2110649198293686, "epoch": 2.125, "grad_norm": 0.019374966621398926, "learning_rate": 1e-06, "loss": 0.0141, "step": 1326 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.20897484570741653, "epoch": 2.126602564102564, "grad_norm": 0.011842702515423298, "learning_rate": 1e-06, "loss": -0.0107, "step": 1327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21269075572490692, "epoch": 2.128205128205128, "grad_norm": 0.01617637649178505, "learning_rate": 1e-06, "loss": 0.007, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 1554.6484375, "completions/mean_terminated_length": 1467.24560546875, "completions/min_length": 448.0, "completions/min_terminated_length": 448.0, "entropy": 0.2093176394701004, "epoch": 2.1298076923076925, "frac_reward_zero_std": 0.59375, "grad_norm": 3.476539134979248, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 927113287.0, "reward": 0.372197687625885, "reward_std": 0.024160441011190414, "rewards/progression_diversity/mean": -0.0009393933578394353, "rewards/progression_diversity/std": 0.014015908353030682, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.7543619871139526, "rewards/symbolic_reward_partial_score/std": 0.20396019518375397, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041762351989746, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.016873836517334, "step": 1329 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.21161337196826935, "epoch": 2.1314102564102564, "grad_norm": 0.009917390532791615, "learning_rate": 1e-06, "loss": 0.025, "step": 1330 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.21297961473464966, "epoch": 2.1330128205128207, "grad_norm": 0.008365034125745296, "learning_rate": 1e-06, "loss": 0.0207, "step": 1331 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.21263594925403595, "epoch": 2.1346153846153846, "grad_norm": 0.020000530406832695, "learning_rate": 1e-06, "loss": 0.001, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2631.0, "completions/max_terminated_length": 2631.0, "completions/mean_length": 1392.521484375, "completions/mean_terminated_length": 1392.521484375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.21844053268432617, "epoch": 2.136217948717949, "frac_reward_zero_std": 0.5, "grad_norm": 0.02576032653450966, "learning_rate": 1e-06, "loss": -0.0026, "num_tokens": 928662738.0, "reward": 0.39745038747787476, "reward_std": 0.04422683268785477, "rewards/progression_diversity/mean": -8.176280243787915e-05, "rewards/progression_diversity/std": 0.001850081025622785, "rewards/symbolic_reward_accuracy/mean": 0.2890625, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.7467122673988342, "rewards/symbolic_reward_partial_score/std": 0.21178719401359558, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0465729236602783, "sampling/importance_sampling_ratio/min": 9.539062739349902e-05, "sampling/sampling_logp_difference/max": 9.257530212402344, "sampling/sampling_logp_difference/mean": 0.09742757678031921, "step": 1333 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.22133174538612366, "epoch": 2.1378205128205128, "grad_norm": 0.012168043293058872, "learning_rate": 1e-06, "loss": -0.0017, "step": 1334 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.22107722610235214, "epoch": 2.139423076923077, "grad_norm": 0.010279431007802486, "learning_rate": 1e-06, "loss": -0.0065, "step": 1335 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21761613339185715, "epoch": 2.141025641025641, "grad_norm": 0.008288303390145302, "learning_rate": 1e-06, "loss": 0.0063, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 1633.904296875, "completions/mean_terminated_length": 1605.0391845703125, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.20767099410295486, "epoch": 2.1426282051282053, "frac_reward_zero_std": 0.25, "grad_norm": 0.021908177062869072, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 930332929.0, "reward": 0.27803462743759155, "reward_std": 0.053543590009212494, "rewards/progression_diversity/mean": -0.0002496470115147531, "rewards/progression_diversity/std": 0.0035664818715304136, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.661816418170929, "rewards/symbolic_reward_partial_score/std": 0.1937265247106552, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0444786548614502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.3441740572452545, "step": 1337 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.20627369731664658, "epoch": 2.144230769230769, "grad_norm": 0.026881849393248558, "learning_rate": 1e-06, "loss": 0.0346, "step": 1338 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21029096841812134, "epoch": 2.1458333333333335, "grad_norm": 0.015624100342392921, "learning_rate": 1e-06, "loss": 0.0001, "step": 1339 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.21201331168413162, "epoch": 2.1474358974358974, "grad_norm": 0.023685337975621223, "learning_rate": 1e-06, "loss": -0.0102, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2974.0, "completions/max_terminated_length": 2974.0, "completions/mean_length": 1587.693359375, "completions/mean_terminated_length": 1587.693359375, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.20493042469024658, "epoch": 2.1490384615384617, "frac_reward_zero_std": 0.375, "grad_norm": 0.022984595969319344, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 931986948.0, "reward": 0.4041173458099365, "reward_std": 0.039462070912122726, "rewards/progression_diversity/mean": -0.0008660133462399244, "rewards/progression_diversity/std": 0.006552197504788637, "rewards/symbolic_reward_accuracy/mean": 0.296875, "rewards/symbolic_reward_accuracy/std": 0.45732781291007996, "rewards/symbolic_reward_partial_score/mean": 0.7533365488052368, "rewards/symbolic_reward_partial_score/std": 0.21289843320846558, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0439233779907227, "sampling/importance_sampling_ratio/min": 0.0010176009964197874, "sampling/sampling_logp_difference/max": 6.890307426452637, "sampling/sampling_logp_difference/mean": 0.0925874412059784, "step": 1341 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.20170185714960098, "epoch": 2.1506410256410255, "grad_norm": 0.022239655256271362, "learning_rate": 1e-06, "loss": -0.0016, "step": 1342 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2027420625090599, "epoch": 2.15224358974359, "grad_norm": 0.01715942844748497, "learning_rate": 1e-06, "loss": 0.0027, "step": 1343 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.20686108618974686, "epoch": 2.1538461538461537, "grad_norm": 0.010147986933588982, "learning_rate": 1e-06, "loss": 0.0076, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3213.0, "completions/max_terminated_length": 3213.0, "completions/mean_length": 1698.078125, "completions/mean_terminated_length": 1698.078125, "completions/min_length": 559.0, "completions/min_terminated_length": 559.0, "entropy": 0.2128048539161682, "epoch": 2.155448717948718, "frac_reward_zero_std": 0.40625, "grad_norm": 0.016743527725338936, "learning_rate": 1e-06, "loss": -0.0082, "num_tokens": 933690460.0, "reward": 0.3048563003540039, "reward_std": 0.04834011197090149, "rewards/progression_diversity/mean": -0.0007014497532509267, "rewards/progression_diversity/std": 0.00812255684286356, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.7115234136581421, "rewards/symbolic_reward_partial_score/std": 0.18424317240715027, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451080799102783, "sampling/importance_sampling_ratio/min": 6.933895201655105e-05, "sampling/sampling_logp_difference/max": 9.57650375366211, "sampling/sampling_logp_difference/mean": 0.09451892971992493, "step": 1345 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.21316298097372055, "epoch": 2.157051282051282, "grad_norm": 0.013251153752207756, "learning_rate": 1e-06, "loss": 0.0005, "step": 1346 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.20967798680067062, "epoch": 2.1586538461538463, "grad_norm": 0.015155055560171604, "learning_rate": 1e-06, "loss": 0.0159, "step": 1347 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2121565341949463, "epoch": 2.16025641025641, "grad_norm": 0.024950312450528145, "learning_rate": 1e-06, "loss": -0.0028, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3127.0, "completions/mean_length": 1755.244140625, "completions/mean_terminated_length": 1697.8765869140625, "completions/min_length": 394.0, "completions/min_terminated_length": 394.0, "entropy": 0.20723742246627808, "epoch": 2.1618589743589745, "frac_reward_zero_std": 0.375, "grad_norm": 0.02118254266679287, "learning_rate": 1e-06, "loss": 0.0059, "num_tokens": 935479065.0, "reward": 0.4168354868888855, "reward_std": 0.05675492808222771, "rewards/progression_diversity/mean": -0.000535505183506757, "rewards/progression_diversity/std": 0.0042114765383303165, "rewards/symbolic_reward_accuracy/mean": 0.32421875, "rewards/symbolic_reward_accuracy/std": 0.4685399830341339, "rewards/symbolic_reward_partial_score/mean": 0.7423340082168579, "rewards/symbolic_reward_partial_score/std": 0.23071099817752838, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0427265167236328, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.5817426443099976, "step": 1349 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.20122719556093216, "epoch": 2.1634615384615383, "grad_norm": 0.01896592602133751, "learning_rate": 1e-06, "loss": 0.0248, "step": 1350 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.20486315339803696, "epoch": 2.1650641025641026, "grad_norm": 0.01839040406048298, "learning_rate": 1e-06, "loss": 0.0417, "step": 1351 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.20524092018604279, "epoch": 2.1666666666666665, "grad_norm": 0.013213125988841057, "learning_rate": 1e-06, "loss": -0.0134, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3402.0, "completions/mean_length": 1791.140625, "completions/mean_terminated_length": 1705.1317138671875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.1993129849433899, "epoch": 2.168269230769231, "frac_reward_zero_std": 0.21875, "grad_norm": 0.018554260954260826, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 937293713.0, "reward": 0.3082357347011566, "reward_std": 0.05308583378791809, "rewards/progression_diversity/mean": -0.0016233250498771667, "rewards/progression_diversity/std": 0.010440926998853683, "rewards/symbolic_reward_accuracy/mean": 0.1640625, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.70068359375, "rewards/symbolic_reward_partial_score/std": 0.1991189867258072, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0414732694625854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.7726681232452393, "step": 1353 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2003878429532051, "epoch": 2.1698717948717947, "grad_norm": 0.024082843214273453, "learning_rate": 1e-06, "loss": 0.0203, "step": 1354 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.20279623568058014, "epoch": 2.171474358974359, "grad_norm": 0.02481573261320591, "learning_rate": 1e-06, "loss": 0.0156, "step": 1355 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.20432738959789276, "epoch": 2.173076923076923, "grad_norm": 0.023944402113556862, "learning_rate": 1e-06, "loss": 0.0234, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4818.0, "completions/mean_length": 1645.9921875, "completions/mean_terminated_length": 1617.150634765625, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "entropy": 0.20042243599891663, "epoch": 2.1746794871794872, "frac_reward_zero_std": 0.375, "grad_norm": 0.025058995932340622, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 939053501.0, "reward": 0.3049207925796509, "reward_std": 0.029460538178682327, "rewards/progression_diversity/mean": -0.0020646725315600634, "rewards/progression_diversity/std": 0.010834318585693836, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.7085286378860474, "rewards/symbolic_reward_partial_score/std": 0.19462068378925323, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0425053834915161, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.37059780955314636, "step": 1357 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.19977743178606033, "epoch": 2.176282051282051, "grad_norm": 0.01753915846347809, "learning_rate": 1e-06, "loss": -0.0028, "step": 1358 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.19901012629270554, "epoch": 2.1778846153846154, "grad_norm": 0.020039677619934082, "learning_rate": 1e-06, "loss": 0.0348, "step": 1359 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.1974909007549286, "epoch": 2.1794871794871793, "grad_norm": 0.016383672133088112, "learning_rate": 1e-06, "loss": 0.0079, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3294.0, "completions/mean_length": 1614.376953125, "completions/mean_terminated_length": 1556.4569091796875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 0.2029455080628395, "epoch": 2.1810897435897436, "frac_reward_zero_std": 0.125, "grad_norm": 80.58993530273438, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 940647406.0, "reward": 0.41535690426826477, "reward_std": 0.07679130136966705, "rewards/progression_diversity/mean": -0.0019082010257989168, "rewards/progression_diversity/std": 0.01008455641567707, "rewards/symbolic_reward_accuracy/mean": 0.310546875, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.7647949457168579, "rewards/symbolic_reward_partial_score/std": 0.22581757605075836, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.042673110961914, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.7193973660469055, "step": 1361 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.203001469373703, "epoch": 2.1826923076923075, "grad_norm": 0.021764498203992844, "learning_rate": 1e-06, "loss": 0.0228, "step": 1362 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2056587189435959, "epoch": 2.184294871794872, "grad_norm": 0.020050769671797752, "learning_rate": 1e-06, "loss": -0.0023, "step": 1363 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2047322392463684, "epoch": 2.185897435897436, "grad_norm": 0.02470891922712326, "learning_rate": 1e-06, "loss": 0.0074, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3347.0, "completions/mean_length": 1624.029296875, "completions/mean_terminated_length": 1537.035400390625, "completions/min_length": 432.0, "completions/min_terminated_length": 432.0, "entropy": 0.19499259442090988, "epoch": 2.1875, "frac_reward_zero_std": 0.1875, "grad_norm": 0.025656770914793015, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 942358845.0, "reward": 0.33442234992980957, "reward_std": 0.039018601179122925, "rewards/progression_diversity/mean": -0.0016129279974848032, "rewards/progression_diversity/std": 0.011494509875774384, "rewards/symbolic_reward_accuracy/mean": 0.1953125, "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, "rewards/symbolic_reward_partial_score/mean": 0.7261229753494263, "rewards/symbolic_reward_partial_score/std": 0.2054896205663681, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0397260189056396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.8769243955612183, "step": 1365 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.19767170399427414, "epoch": 2.189102564102564, "grad_norm": 0.015550960786640644, "learning_rate": 1e-06, "loss": 0.0246, "step": 1366 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.19075892865657806, "epoch": 2.190705128205128, "grad_norm": 0.024761386215686798, "learning_rate": 1e-06, "loss": 0.019, "step": 1367 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.1886235773563385, "epoch": 2.1923076923076925, "grad_norm": 0.015679223462939262, "learning_rate": 1e-06, "loss": 0.0347, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3384.0, "completions/mean_length": 1550.146484375, "completions/mean_terminated_length": 1521.117431640625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.1886545717716217, "epoch": 2.1939102564102564, "frac_reward_zero_std": 0.34375, "grad_norm": 0.02213294804096222, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 944050936.0, "reward": 0.3354012668132782, "reward_std": 0.0484355203807354, "rewards/progression_diversity/mean": -0.0004021337954327464, "rewards/progression_diversity/std": 0.0033471970818936825, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.6967936158180237, "rewards/symbolic_reward_partial_score/std": 0.24216388165950775, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0405962467193604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.31618547439575195, "step": 1369 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.19220811873674393, "epoch": 2.1955128205128207, "grad_norm": 0.017105696722865105, "learning_rate": 1e-06, "loss": 0.0032, "step": 1370 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.18837008625268936, "epoch": 2.1971153846153846, "grad_norm": 0.020541273057460785, "learning_rate": 1e-06, "loss": -0.0059, "step": 1371 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.19123337417840958, "epoch": 2.198717948717949, "grad_norm": 0.016050660982728004, "learning_rate": 1e-06, "loss": 0.0019, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 1501.89453125, "completions/mean_terminated_length": 1443.533447265625, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "entropy": 0.1963193044066429, "epoch": 2.2003205128205128, "frac_reward_zero_std": 0.1875, "grad_norm": 839.10888671875, "learning_rate": 1e-06, "loss": 0.0385, "num_tokens": 945680274.0, "reward": 0.3143922984600067, "reward_std": 0.04647096246480942, "rewards/progression_diversity/mean": -0.0016889558173716068, "rewards/progression_diversity/std": 0.009619291871786118, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.6782389283180237, "rewards/symbolic_reward_partial_score/std": 0.20476947724819183, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0405044555664062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.8936862349510193, "step": 1373 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.19663988798856735, "epoch": 2.201923076923077, "grad_norm": 0.023362474516034126, "learning_rate": 1e-06, "loss": 0.0048, "step": 1374 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.19550593197345734, "epoch": 2.203525641025641, "grad_norm": 0.01925286464393139, "learning_rate": 1e-06, "loss": 0.0164, "step": 1375 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.198850616812706, "epoch": 2.2051282051282053, "grad_norm": 0.012360334396362305, "learning_rate": 1e-06, "loss": 0.0038, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10325.0, "completions/mean_length": 1557.294921875, "completions/mean_terminated_length": 1499.151123046875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "entropy": 0.1966118887066841, "epoch": 2.206730769230769, "frac_reward_zero_std": 0.34375, "grad_norm": 0.030109280720353127, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 947367161.0, "reward": 0.2886817753314972, "reward_std": 0.03847378119826317, "rewards/progression_diversity/mean": -0.0009640345815569162, "rewards/progression_diversity/std": 0.006367819383740425, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.6979818344116211, "rewards/symbolic_reward_partial_score/std": 0.1839800328016281, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0392873287200928, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.0182666778564453, "step": 1377 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.1928621083498001, "epoch": 2.2083333333333335, "grad_norm": 0.01684403605759144, "learning_rate": 1e-06, "loss": 0.0053, "step": 1378 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.19299018383026123, "epoch": 2.2099358974358974, "grad_norm": 0.013669331558048725, "learning_rate": 1e-06, "loss": 0.0245, "step": 1379 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.19235268235206604, "epoch": 2.2115384615384617, "grad_norm": 0.013778350315988064, "learning_rate": 1e-06, "loss": 0.0121, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3313.0, "completions/mean_length": 1556.845703125, "completions/mean_terminated_length": 1527.8297119140625, "completions/min_length": 408.0, "completions/min_terminated_length": 408.0, "entropy": 0.19688106328248978, "epoch": 2.2131410256410255, "frac_reward_zero_std": 0.28125, "grad_norm": 0.03251493349671364, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 949020410.0, "reward": 0.26278871297836304, "reward_std": 0.05675513297319412, "rewards/progression_diversity/mean": -0.00042738509364426136, "rewards/progression_diversity/std": 0.004007916897535324, "rewards/symbolic_reward_accuracy/mean": 0.111328125, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.6539713144302368, "rewards/symbolic_reward_partial_score/std": 0.2113257795572281, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0415072441101074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.9642093777656555, "step": 1381 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2019263282418251, "epoch": 2.21474358974359, "grad_norm": 0.013695978559553623, "learning_rate": 1e-06, "loss": 0.0128, "step": 1382 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.20185761898756027, "epoch": 2.2163461538461537, "grad_norm": 0.016754373908042908, "learning_rate": 1e-06, "loss": -0.0021, "step": 1383 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.20480705797672272, "epoch": 2.217948717948718, "grad_norm": 0.013411487452685833, "learning_rate": 1e-06, "loss": 0.0017, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3598.0, "completions/mean_length": 1521.16015625, "completions/mean_terminated_length": 1433.5599365234375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.21776793152093887, "epoch": 2.219551282051282, "frac_reward_zero_std": 0.15625, "grad_norm": 0.022128723561763763, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 950676124.0, "reward": 0.3249192237854004, "reward_std": 0.05313926190137863, "rewards/progression_diversity/mean": -0.0017315060831606388, "rewards/progression_diversity/std": 0.009827325120568275, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.7178873419761658, "rewards/symbolic_reward_partial_score/std": 0.20977161824703217, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0407921075820923, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 2.0792016983032227, "step": 1385 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.20703934133052826, "epoch": 2.2211538461538463, "grad_norm": 0.028047246858477592, "learning_rate": 1e-06, "loss": 0.0584, "step": 1386 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.21588321775197983, "epoch": 2.22275641025641, "grad_norm": 0.017534522339701653, "learning_rate": 1e-06, "loss": -0.0009, "step": 1387 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2086542472243309, "epoch": 2.2243589743589745, "grad_norm": 0.015552827157080173, "learning_rate": 1e-06, "loss": 0.0135, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3854.0, "completions/mean_length": 1460.521484375, "completions/mean_terminated_length": 1401.9981689453125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.2114865928888321, "epoch": 2.2259615384615383, "frac_reward_zero_std": 0.34375, "grad_norm": 393.90008544921875, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 952301287.0, "reward": 0.3778611421585083, "reward_std": 0.04598058760166168, "rewards/progression_diversity/mean": -0.000995249254629016, "rewards/progression_diversity/std": 0.006696970667690039, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.7686848640441895, "rewards/symbolic_reward_partial_score/std": 0.1856938898563385, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0438477993011475, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.968491792678833, "step": 1389 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22220055013895035, "epoch": 2.2275641025641026, "grad_norm": 0.007562046870589256, "learning_rate": 1e-06, "loss": -0.0002, "step": 1390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.21346572041511536, "epoch": 2.2291666666666665, "grad_norm": 0.02975591830909252, "learning_rate": 1e-06, "loss": 0.0023, "step": 1391 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.20686539262533188, "epoch": 2.230769230769231, "grad_norm": 0.023594049736857414, "learning_rate": 1e-06, "loss": 0.0378, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 1543.576171875, "completions/mean_terminated_length": 1456.108154296875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.21457184851169586, "epoch": 2.2323717948717947, "frac_reward_zero_std": 0.375, "grad_norm": 0.024636728689074516, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 953897950.0, "reward": 0.3451923131942749, "reward_std": 0.03799974173307419, "rewards/progression_diversity/mean": -0.0012781170662492514, "rewards/progression_diversity/std": 0.014912668615579605, "rewards/symbolic_reward_accuracy/mean": 0.205078125, "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, "rewards/symbolic_reward_partial_score/mean": 0.7424805164337158, "rewards/symbolic_reward_partial_score/std": 0.19341571629047394, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0439865589141846, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.7786634564399719, "step": 1393 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.20991724729537964, "epoch": 2.233974358974359, "grad_norm": 0.018171893432736397, "learning_rate": 1e-06, "loss": 0.0823, "step": 1394 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.21980993449687958, "epoch": 2.235576923076923, "grad_norm": 0.018304111436009407, "learning_rate": 1e-06, "loss": -0.0022, "step": 1395 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21467426419258118, "epoch": 2.2371794871794872, "grad_norm": 0.011014263145625591, "learning_rate": 1e-06, "loss": -0.0085, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 11943.0, "completions/mean_length": 1486.80859375, "completions/mean_terminated_length": 1428.3883056640625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.20776310563087463, "epoch": 2.238782051282051, "frac_reward_zero_std": 0.46875, "grad_norm": 0.04726417362689972, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 955540620.0, "reward": 0.279166579246521, "reward_std": 0.017867092043161392, "rewards/progression_diversity/mean": -0.001313370536081493, "rewards/progression_diversity/std": 0.011600782163441181, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.6819010972976685, "rewards/symbolic_reward_partial_score/std": 0.2057180553674698, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0435209274291992, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.23488587141036987, "step": 1397 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.20805874466896057, "epoch": 2.2403846153846154, "grad_norm": 0.020725863054394722, "learning_rate": 1e-06, "loss": -0.0034, "step": 1398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2029411420226097, "epoch": 2.2419871794871793, "grad_norm": 0.008639580570161343, "learning_rate": 1e-06, "loss": 0.0162, "step": 1399 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2050759717822075, "epoch": 2.2435897435897436, "grad_norm": 0.011195927858352661, "learning_rate": 1e-06, "loss": 0.0212, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 1376.677734375, "completions/mean_terminated_length": 1347.3092041015625, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.21754567325115204, "epoch": 2.2451923076923075, "frac_reward_zero_std": 0.46875, "grad_norm": 0.01782279461622238, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 957101943.0, "reward": 0.3711217939853668, "reward_std": 0.0564521960914135, "rewards/progression_diversity/mean": -0.0006168890977278352, "rewards/progression_diversity/std": 0.005630532745271921, "rewards/symbolic_reward_accuracy/mean": 0.271484375, "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, "rewards/symbolic_reward_partial_score/mean": 0.6947753429412842, "rewards/symbolic_reward_partial_score/std": 0.22683373093605042, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451213121414185, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.8511496782302856, "step": 1401 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.21188554912805557, "epoch": 2.246794871794872, "grad_norm": 0.01593591831624508, "learning_rate": 1e-06, "loss": 0.0121, "step": 1402 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21926379948854446, "epoch": 2.248397435897436, "grad_norm": 0.013704286888241768, "learning_rate": 1e-06, "loss": 0.0011, "step": 1403 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.21662156283855438, "epoch": 2.25, "grad_norm": 0.009653382934629917, "learning_rate": 1e-06, "loss": 0.0036, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3479.0, "completions/mean_length": 1432.54296875, "completions/mean_terminated_length": 1403.28369140625, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.21846728771924973, "epoch": 2.251602564102564, "frac_reward_zero_std": 0.4375, "grad_norm": 0.02795334719121456, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 958673405.0, "reward": 0.28600820899009705, "reward_std": 0.04315223544836044, "rewards/progression_diversity/mean": -0.0007437419844791293, "rewards/progression_diversity/std": 0.008263318799436092, "rewards/symbolic_reward_accuracy/mean": 0.130859375, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.6923177242279053, "rewards/symbolic_reward_partial_score/std": 0.18848715722560883, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0477138757705688, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 716.0, "sampling/sampling_logp_difference/mean": 0.16357702016830444, "step": 1405 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.22100605070590973, "epoch": 2.253205128205128, "grad_norm": 0.02042258158326149, "learning_rate": 1e-06, "loss": 0.0307, "step": 1406 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2175752967596054, "epoch": 2.2548076923076925, "grad_norm": 0.013299060985445976, "learning_rate": 1e-06, "loss": 0.0054, "step": 1407 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2246379554271698, "epoch": 2.2564102564102564, "grad_norm": 0.01301884651184082, "learning_rate": 1e-06, "loss": -0.0056, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 6158.0, "completions/mean_length": 1540.216796875, "completions/mean_terminated_length": 1482.0059814453125, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "entropy": 0.22940433025360107, "epoch": 2.2580128205128207, "frac_reward_zero_std": 0.34375, "grad_norm": 0.01926892064511776, "learning_rate": 1e-06, "loss": -0.0045, "num_tokens": 960228028.0, "reward": 0.3234546184539795, "reward_std": 0.027436789125204086, "rewards/progression_diversity/mean": -0.0012210736749693751, "rewards/progression_diversity/std": 0.010683650150895119, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.7084310054779053, "rewards/symbolic_reward_partial_score/std": 0.20990176498889923, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0487642288208008, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.48923975229263306, "step": 1409 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.23073109239339828, "epoch": 2.2596153846153846, "grad_norm": 0.005647487938404083, "learning_rate": 1e-06, "loss": -0.0, "step": 1410 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2332421839237213, "epoch": 2.261217948717949, "grad_norm": 0.02655133046209812, "learning_rate": 1e-06, "loss": 0.0222, "step": 1411 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22827783972024918, "epoch": 2.2628205128205128, "grad_norm": 0.013446721248328686, "learning_rate": 1e-06, "loss": 0.0165, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 1543.96484375, "completions/mean_terminated_length": 1456.4990234375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.22754886746406555, "epoch": 2.264423076923077, "frac_reward_zero_std": 0.375, "grad_norm": 23.43297576904297, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 961827418.0, "reward": 0.3246934413909912, "reward_std": 0.04636027663946152, "rewards/progression_diversity/mean": -0.0023379838094115257, "rewards/progression_diversity/std": 0.023444252088665962, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.7321288585662842, "rewards/symbolic_reward_partial_score/std": 0.18431484699249268, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0483700037002563, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 1.1377493143081665, "step": 1413 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.23179607093334198, "epoch": 2.266025641025641, "grad_norm": 0.014759178273379803, "learning_rate": 1e-06, "loss": -0.0051, "step": 1414 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22570616006851196, "epoch": 2.2676282051282053, "grad_norm": 0.022691620513796806, "learning_rate": 1e-06, "loss": 0.0214, "step": 1415 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23608656972646713, "epoch": 2.269230769230769, "grad_norm": 0.010918180458247662, "learning_rate": 1e-06, "loss": 0.0075, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4234.0, "completions/max_terminated_length": 4234.0, "completions/mean_length": 1428.330078125, "completions/mean_terminated_length": 1428.330078125, "completions/min_length": 361.0, "completions/min_terminated_length": 361.0, "entropy": 0.2318318784236908, "epoch": 2.2708333333333335, "frac_reward_zero_std": 0.34375, "grad_norm": 0.02078605629503727, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 963460099.0, "reward": 0.3764505386352539, "reward_std": 0.07807396352291107, "rewards/progression_diversity/mean": -0.0009436353575438261, "rewards/progression_diversity/std": 0.011008831672370434, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.7001790404319763, "rewards/symbolic_reward_partial_score/std": 0.22067667543888092, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0520446300506592, "sampling/importance_sampling_ratio/min": 0.0008081789710558951, "sampling/sampling_logp_difference/max": 7.120727062225342, "sampling/sampling_logp_difference/mean": 0.10327097773551941, "step": 1417 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23364517837762833, "epoch": 2.2724358974358974, "grad_norm": 0.018402917310595512, "learning_rate": 1e-06, "loss": 0.0045, "step": 1418 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2324867844581604, "epoch": 2.2740384615384617, "grad_norm": 0.015934674069285393, "learning_rate": 1e-06, "loss": 0.0109, "step": 1419 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23535658419132233, "epoch": 2.2756410256410255, "grad_norm": 0.021948518231511116, "learning_rate": 1e-06, "loss": -0.0146, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4152.0, "completions/mean_length": 1650.921875, "completions/mean_terminated_length": 1593.1451416015625, "completions/min_length": 475.0, "completions/min_terminated_length": 475.0, "entropy": 0.23526672273874283, "epoch": 2.27724358974359, "frac_reward_zero_std": 0.25, "grad_norm": 0.023312116041779518, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 965069179.0, "reward": 0.34999796748161316, "reward_std": 0.03697862848639488, "rewards/progression_diversity/mean": -0.0016677755629643798, "rewards/progression_diversity/std": 0.013735439628362656, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.7461426258087158, "rewards/symbolic_reward_partial_score/std": 0.19722376763820648, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.04874849319458, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 1.2209948301315308, "step": 1421 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.23266298323869705, "epoch": 2.2788461538461537, "grad_norm": 0.028419379144906998, "learning_rate": 1e-06, "loss": -0.0001, "step": 1422 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2373318374156952, "epoch": 2.280448717948718, "grad_norm": 684.584228515625, "learning_rate": 1e-06, "loss": 0.0291, "step": 1423 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23147784173488617, "epoch": 2.282051282051282, "grad_norm": 0.015300055034458637, "learning_rate": 1e-06, "loss": 0.0354, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7387.0, "completions/mean_length": 1647.84765625, "completions/mean_terminated_length": 1590.0589599609375, "completions/min_length": 459.0, "completions/min_terminated_length": 459.0, "entropy": 0.22117284685373306, "epoch": 2.2836538461538463, "frac_reward_zero_std": 0.40625, "grad_norm": 0.02695753425359726, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 966809885.0, "reward": 0.3317036032676697, "reward_std": 0.04540080577135086, "rewards/progression_diversity/mean": -0.0010283860610798001, "rewards/progression_diversity/std": 0.014310967177152634, "rewards/symbolic_reward_accuracy/mean": 0.197265625, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.7124837040901184, "rewards/symbolic_reward_partial_score/std": 0.21933910250663757, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0476967096328735, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 1.099198818206787, "step": 1425 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2237677425146103, "epoch": 2.28525641025641, "grad_norm": 0.013611048460006714, "learning_rate": 1e-06, "loss": 0.0155, "step": 1426 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.22483280301094055, "epoch": 2.2868589743589745, "grad_norm": 0.02124941535294056, "learning_rate": 1e-06, "loss": 0.004, "step": 1427 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2248641401529312, "epoch": 2.2884615384615383, "grad_norm": 0.024975134059786797, "learning_rate": 1e-06, "loss": -0.01, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2827.0, "completions/mean_length": 1540.505859375, "completions/mean_terminated_length": 1482.296142578125, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "entropy": 0.23250573128461838, "epoch": 2.2900641025641026, "frac_reward_zero_std": 0.5625, "grad_norm": 823.3074340820312, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 968396016.0, "reward": 0.2985820770263672, "reward_std": 0.015081477351486683, "rewards/progression_diversity/mean": -0.0006824900628998876, "rewards/progression_diversity/std": 0.005089160054922104, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.7465982437133789, "rewards/symbolic_reward_partial_score/std": 0.16918420791625977, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0513556003570557, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.42589616775512695, "step": 1429 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2318640798330307, "epoch": 2.2916666666666665, "grad_norm": 0.01384742185473442, "learning_rate": 1e-06, "loss": -0.0038, "step": 1430 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2302088588476181, "epoch": 2.293269230769231, "grad_norm": 0.015527794137597084, "learning_rate": 1e-06, "loss": -0.0001, "step": 1431 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.23823175579309464, "epoch": 2.2948717948717947, "grad_norm": 40.3269157409668, "learning_rate": 1e-06, "loss": 0.0411, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4556.0, "completions/mean_length": 1663.361328125, "completions/mean_terminated_length": 1576.5992431640625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.21347655355930328, "epoch": 2.296474358974359, "frac_reward_zero_std": 0.21875, "grad_norm": 78.3970947265625, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 970189689.0, "reward": 0.3578779399394989, "reward_std": 0.055181439965963364, "rewards/progression_diversity/mean": -0.0007812740514054894, "rewards/progression_diversity/std": 0.008964339271187782, "rewards/symbolic_reward_accuracy/mean": 0.25390625, "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, "rewards/symbolic_reward_partial_score/mean": 0.6870931386947632, "rewards/symbolic_reward_partial_score/std": 0.22139233350753784, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0451353788375854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.4013971090316772, "step": 1433 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.21957285702228546, "epoch": 2.298076923076923, "grad_norm": 0.013484461233019829, "learning_rate": 1e-06, "loss": -0.0143, "step": 1434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.21703750640153885, "epoch": 2.2996794871794872, "grad_norm": 0.030732089653611183, "learning_rate": 1e-06, "loss": 0.0609, "step": 1435 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.22398217022418976, "epoch": 2.301282051282051, "grad_norm": 0.021893825381994247, "learning_rate": 1e-06, "loss": 0.0065, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3516.0, "completions/mean_length": 1687.796875, "completions/mean_terminated_length": 1601.1788330078125, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "entropy": 0.21840395033359528, "epoch": 2.3028846153846154, "frac_reward_zero_std": 0.1875, "grad_norm": 47.56247329711914, "learning_rate": 1e-06, "loss": 0.0497, "num_tokens": 971978193.0, "reward": 0.3468187153339386, "reward_std": 0.043784305453300476, "rewards/progression_diversity/mean": -0.0017242016037926078, "rewards/progression_diversity/std": 0.017134975641965866, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.7205728888511658, "rewards/symbolic_reward_partial_score/std": 0.2209366112947464, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0465869903564453, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 1.2852548360824585, "step": 1437 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22310731559991837, "epoch": 2.3044871794871793, "grad_norm": 0.029605181887745857, "learning_rate": 1e-06, "loss": -0.0095, "step": 1438 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.21875733882188797, "epoch": 2.3060897435897436, "grad_norm": 0.022749239578843117, "learning_rate": 1e-06, "loss": 0.0269, "step": 1439 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.22598165273666382, "epoch": 2.3076923076923075, "grad_norm": 0.014465555548667908, "learning_rate": 1e-06, "loss": 0.0126, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3771.0, "completions/mean_length": 1645.763671875, "completions/mean_terminated_length": 1616.9217529296875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.22972248494625092, "epoch": 2.309294871794872, "frac_reward_zero_std": 0.34375, "grad_norm": 0.02680191770195961, "learning_rate": 1e-06, "loss": -0.0029, "num_tokens": 973626632.0, "reward": 0.3762263059616089, "reward_std": 0.05338224023580551, "rewards/progression_diversity/mean": -0.0018829565960913897, "rewards/progression_diversity/std": 0.017473464831709862, "rewards/symbolic_reward_accuracy/mean": 0.25, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.754150390625, "rewards/symbolic_reward_partial_score/std": 0.19963204860687256, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0482197999954224, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 1.3025373220443726, "step": 1441 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.22302179783582687, "epoch": 2.310897435897436, "grad_norm": 0.016930239275097847, "learning_rate": 1e-06, "loss": 0.01, "step": 1442 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.22931306064128876, "epoch": 2.3125, "grad_norm": 0.017596367746591568, "learning_rate": 1e-06, "loss": 0.0018, "step": 1443 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2299589440226555, "epoch": 2.314102564102564, "grad_norm": 0.01738305389881134, "learning_rate": 1e-06, "loss": -0.0058, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5412.0, "completions/max_terminated_length": 5412.0, "completions/mean_length": 1501.36328125, "completions/mean_terminated_length": 1501.36328125, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.23561769723892212, "epoch": 2.315705128205128, "frac_reward_zero_std": 0.375, "grad_norm": 0.024762306362390518, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 975200130.0, "reward": 0.39472496509552, "reward_std": 0.027176737785339355, "rewards/progression_diversity/mean": -0.0006506302743218839, "rewards/progression_diversity/std": 0.007446780800819397, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.7610839605331421, "rewards/symbolic_reward_partial_score/std": 0.19756866991519928, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0539369583129883, "sampling/importance_sampling_ratio/min": 4.62967858494423e-12, "sampling/sampling_logp_difference/max": 26.098533630371094, "sampling/sampling_logp_difference/mean": 0.10675007849931717, "step": 1445 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24369582533836365, "epoch": 2.3173076923076925, "grad_norm": 0.02469770982861519, "learning_rate": 1e-06, "loss": -0.002, "step": 1446 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24448219686746597, "epoch": 2.3189102564102564, "grad_norm": 0.018661778420209885, "learning_rate": 1e-06, "loss": -0.0009, "step": 1447 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24316756427288055, "epoch": 2.3205128205128207, "grad_norm": 0.010404370725154877, "learning_rate": 1e-06, "loss": 0.0039, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3668.0, "completions/mean_length": 1508.080078125, "completions/mean_terminated_length": 1449.7431640625, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.24711936712265015, "epoch": 2.3221153846153846, "frac_reward_zero_std": 0.5, "grad_norm": 0.029730960726737976, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 976758619.0, "reward": 0.36418893933296204, "reward_std": 0.02938855066895485, "rewards/progression_diversity/mean": -0.0005412165191955864, "rewards/progression_diversity/std": 0.006034459453076124, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.742626965045929, "rewards/symbolic_reward_partial_score/std": 0.20223668217658997, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0534663200378418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.6994922161102295, "step": 1449 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24198869615793228, "epoch": 2.323717948717949, "grad_norm": 0.018425533547997475, "learning_rate": 1e-06, "loss": 0.0282, "step": 1450 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24656683206558228, "epoch": 2.3253205128205128, "grad_norm": 0.013853712007403374, "learning_rate": 1e-06, "loss": -0.0072, "step": 1451 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.24908402562141418, "epoch": 2.326923076923077, "grad_norm": 0.01092120073735714, "learning_rate": 1e-06, "loss": 0.0053, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 1472.052734375, "completions/mean_terminated_length": 1442.870849609375, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "entropy": 0.2532293498516083, "epoch": 2.328525641025641, "frac_reward_zero_std": 0.40625, "grad_norm": 0.021764414384961128, "learning_rate": 1e-06, "loss": -0.0082, "num_tokens": 978359622.0, "reward": 0.3215084671974182, "reward_std": 0.039750613272190094, "rewards/progression_diversity/mean": -0.0005217455327510834, "rewards/progression_diversity/std": 0.008216914720833302, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.7129882574081421, "rewards/symbolic_reward_partial_score/std": 0.21137480437755585, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0538359880447388, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.4409805238246918, "step": 1453 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.24427498131990433, "epoch": 2.3301282051282053, "grad_norm": 0.015310117974877357, "learning_rate": 1e-06, "loss": 0.004, "step": 1454 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2432166486978531, "epoch": 2.331730769230769, "grad_norm": 895.1507568359375, "learning_rate": 1e-06, "loss": 0.046, "step": 1455 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.24313075095415115, "epoch": 2.3333333333333335, "grad_norm": 0.013614215888082981, "learning_rate": 1e-06, "loss": -0.002, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3000.0, "completions/mean_length": 1463.748046875, "completions/mean_terminated_length": 1434.5499267578125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "entropy": 0.2332357093691826, "epoch": 2.3349358974358974, "frac_reward_zero_std": 0.5, "grad_norm": 0.02649446576833725, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 979968421.0, "reward": 0.33722105622291565, "reward_std": 0.0333821102976799, "rewards/progression_diversity/mean": -0.0005536978715099394, "rewards/progression_diversity/std": 0.006957771256566048, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.7067708373069763, "rewards/symbolic_reward_partial_score/std": 0.19518350064754486, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.05262291431427, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.4595038890838623, "step": 1457 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.240996353328228, "epoch": 2.3365384615384617, "grad_norm": 0.00819353200495243, "learning_rate": 1e-06, "loss": 0.0182, "step": 1458 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23650028556585312, "epoch": 2.3381410256410255, "grad_norm": 0.021920593455433846, "learning_rate": 1e-06, "loss": 0.017, "step": 1459 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2370261400938034, "epoch": 2.33974358974359, "grad_norm": 0.011823596432805061, "learning_rate": 1e-06, "loss": 0.001, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5619.0, "completions/mean_length": 1463.85546875, "completions/mean_terminated_length": 1375.91748046875, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.22917187213897705, "epoch": 2.3413461538461537, "frac_reward_zero_std": 0.4375, "grad_norm": 196.0175018310547, "learning_rate": 1e-06, "loss": 0.0506, "num_tokens": 981567035.0, "reward": 0.39167657494544983, "reward_std": 0.050342485308647156, "rewards/progression_diversity/mean": -0.0008015389903448522, "rewards/progression_diversity/std": 0.0074392156675457954, "rewards/symbolic_reward_accuracy/mean": 0.2734375, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.7600423097610474, "rewards/symbolic_reward_partial_score/std": 0.2186768352985382, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0496225357055664, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 740.0, "sampling/sampling_logp_difference/mean": 0.7763689756393433, "step": 1461 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.23547165840864182, "epoch": 2.342948717948718, "grad_norm": 0.014117385260760784, "learning_rate": 1e-06, "loss": 0.0003, "step": 1462 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2310246080160141, "epoch": 2.344551282051282, "grad_norm": 0.038323234766721725, "learning_rate": 1e-06, "loss": 0.0181, "step": 1463 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2359386458992958, "epoch": 2.3461538461538463, "grad_norm": 0.018555141985416412, "learning_rate": 1e-06, "loss": 0.0034, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3024.0, "completions/mean_length": 1395.6953125, "completions/mean_terminated_length": 1336.917724609375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.23501623421907425, "epoch": 2.34775641025641, "frac_reward_zero_std": 0.625, "grad_norm": 378.42333984375, "learning_rate": 1e-06, "loss": 0.0315, "num_tokens": 983101327.0, "reward": 0.3867397904396057, "reward_std": 0.011792978271842003, "rewards/progression_diversity/mean": -0.00033900359994731843, "rewards/progression_diversity/std": 0.004276837222278118, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.7943521738052368, "rewards/symbolic_reward_partial_score/std": 0.1727304607629776, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0502277612686157, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 0.6608302593231201, "step": 1465 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.22964345663785934, "epoch": 2.3493589743589745, "grad_norm": 0.007282482460141182, "learning_rate": 1e-06, "loss": 0.0247, "step": 1466 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.23587816208600998, "epoch": 2.3509615384615383, "grad_norm": 0.02002773992717266, "learning_rate": 1e-06, "loss": 0.0049, "step": 1467 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.23493095487356186, "epoch": 2.3525641025641026, "grad_norm": 0.007156542036682367, "learning_rate": 1e-06, "loss": -0.0054, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 1412.869140625, "completions/mean_terminated_length": 1354.158935546875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.23347257822752, "epoch": 2.3541666666666665, "frac_reward_zero_std": 0.46875, "grad_norm": 0.020152533426880836, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 984703036.0, "reward": 0.3319286108016968, "reward_std": 0.04241711646318436, "rewards/progression_diversity/mean": -0.0009903897298499942, "rewards/progression_diversity/std": 0.011630616150796413, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.7282063961029053, "rewards/symbolic_reward_partial_score/std": 0.18158793449401855, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0504162311553955, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 736.0, "sampling/sampling_logp_difference/mean": 0.44282588362693787, "step": 1469 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.23258361965417862, "epoch": 2.355769230769231, "grad_norm": 0.016104018315672874, "learning_rate": 1e-06, "loss": -0.0002, "step": 1470 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.22412847727537155, "epoch": 2.3573717948717947, "grad_norm": 0.009414435364305973, "learning_rate": 1e-06, "loss": 0.0275, "step": 1471 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.2289385423064232, "epoch": 2.358974358974359, "grad_norm": 0.014195187948644161, "learning_rate": 1e-06, "loss": -0.0008, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2557.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 1360.466796875, "completions/mean_terminated_length": 1360.466796875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.22527696937322617, "epoch": 2.360576923076923, "frac_reward_zero_std": 0.5, "grad_norm": 0.013988327234983444, "learning_rate": 1e-06, "loss": -0.0092, "num_tokens": 986369547.0, "reward": 0.31377702951431274, "reward_std": 0.011640192940831184, "rewards/progression_diversity/mean": -0.00022870188695378602, "rewards/progression_diversity/std": 0.0025177807547152042, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.6709309816360474, "rewards/symbolic_reward_partial_score/std": 0.20638255774974823, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.050323247909546, "sampling/importance_sampling_ratio/min": 0.003235085867345333, "sampling/sampling_logp_difference/max": 5.733699798583984, "sampling/sampling_logp_difference/mean": 0.10156988352537155, "step": 1473 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2217569351196289, "epoch": 2.3621794871794872, "grad_norm": 0.02185184508562088, "learning_rate": 1e-06, "loss": 0.0026, "step": 1474 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.22397807240486145, "epoch": 2.363782051282051, "grad_norm": 0.018456028774380684, "learning_rate": 1e-06, "loss": 0.0056, "step": 1475 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.22414569556713104, "epoch": 2.3653846153846154, "grad_norm": 0.012643019668757915, "learning_rate": 1e-06, "loss": -0.0055, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 1278.501953125, "completions/mean_terminated_length": 1189.4715576171875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.2367832437157631, "epoch": 2.3669871794871793, "frac_reward_zero_std": 0.5, "grad_norm": 0.01879701018333435, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 987864604.0, "reward": 0.43188178539276123, "reward_std": 0.03625435382127762, "rewards/progression_diversity/mean": -0.00029982085106894374, "rewards/progression_diversity/std": 0.004407213069498539, "rewards/symbolic_reward_accuracy/mean": 0.330078125, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.7814127206802368, "rewards/symbolic_reward_partial_score/std": 0.21501034498214722, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0473359823226929, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 744.0, "sampling/sampling_logp_difference/mean": 2.7259652614593506, "step": 1477 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2279703989624977, "epoch": 2.3685897435897436, "grad_norm": 30.450448989868164, "learning_rate": 1e-06, "loss": 0.0318, "step": 1478 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.23828475177288055, "epoch": 2.3701923076923075, "grad_norm": 0.024164684116840363, "learning_rate": 1e-06, "loss": -0.0028, "step": 1479 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.23449359089136124, "epoch": 2.371794871794872, "grad_norm": 0.01611475460231304, "learning_rate": 1e-06, "loss": 0.0146, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3828.0, "completions/mean_length": 1289.505859375, "completions/mean_terminated_length": 1230.3118896484375, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "entropy": 0.23153682053089142, "epoch": 2.373397435897436, "frac_reward_zero_std": 0.5, "grad_norm": 0.02150103822350502, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 989388479.0, "reward": 0.41440147161483765, "reward_std": 0.038612596690654755, "rewards/progression_diversity/mean": -0.0002827388816513121, "rewards/progression_diversity/std": 0.003904320765286684, "rewards/symbolic_reward_accuracy/mean": 0.306640625, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.7693685293197632, "rewards/symbolic_reward_partial_score/std": 0.21836112439632416, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0490467548370361, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 748.0, "sampling/sampling_logp_difference/mean": 1.6480077505111694, "step": 1481 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2258523479104042, "epoch": 2.375, "grad_norm": 0.009424294345080853, "learning_rate": 1e-06, "loss": 0.1124, "step": 1482 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.23436233401298523, "epoch": 2.376602564102564, "grad_norm": 0.010978851467370987, "learning_rate": 1e-06, "loss": 0.0041, "step": 1483 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2368711531162262, "epoch": 2.378205128205128, "grad_norm": 0.015040039084851742, "learning_rate": 1e-06, "loss": -0.0046, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 1375.23046875, "completions/mean_terminated_length": 1286.7701416015625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "entropy": 0.22265422344207764, "epoch": 2.3798076923076925, "frac_reward_zero_std": 0.40625, "grad_norm": 365.71807861328125, "learning_rate": 1e-06, "loss": 0.0797, "num_tokens": 990941957.0, "reward": 0.2840445041656494, "reward_std": 0.026247035712003708, "rewards/progression_diversity/mean": -0.00033585538039915264, "rewards/progression_diversity/std": 0.0034242472611367702, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.706591784954071, "rewards/symbolic_reward_partial_score/std": 0.19762486219406128, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0476056337356567, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 748.0, "sampling/sampling_logp_difference/mean": 2.300495147705078, "step": 1485 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2373371571302414, "epoch": 2.3814102564102564, "grad_norm": 0.01105690747499466, "learning_rate": 1e-06, "loss": 0.0004, "step": 1486 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23985521495342255, "epoch": 2.3830128205128207, "grad_norm": 0.015546144917607307, "learning_rate": 1e-06, "loss": -0.0024, "step": 1487 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23665790259838104, "epoch": 2.3846153846153846, "grad_norm": 0.01926671527326107, "learning_rate": 1e-06, "loss": -0.0068, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 1406.267578125, "completions/mean_terminated_length": 1288.3326416015625, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "entropy": 0.23878996819257736, "epoch": 2.386217948717949, "frac_reward_zero_std": 0.4375, "grad_norm": 0.019514024257659912, "learning_rate": 1e-06, "loss": 0.0072, "num_tokens": 992456558.0, "reward": 0.2826894521713257, "reward_std": 0.03426457941532135, "rewards/progression_diversity/mean": -0.0005875998758710921, "rewards/progression_diversity/std": 0.00664177630096674, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.7027344107627869, "rewards/symbolic_reward_partial_score/std": 0.1787167340517044, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0489890575408936, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 2.2175588607788086, "step": 1489 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24255738407373428, "epoch": 2.3878205128205128, "grad_norm": 2492.634033203125, "learning_rate": 1e-06, "loss": 0.0505, "step": 1490 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24588491022586823, "epoch": 2.389423076923077, "grad_norm": 0.017873678356409073, "learning_rate": 1e-06, "loss": -0.0036, "step": 1491 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.24287723004817963, "epoch": 2.391025641025641, "grad_norm": 0.013864979147911072, "learning_rate": 1e-06, "loss": 0.0298, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3078.0, "completions/mean_length": 1381.619140625, "completions/mean_terminated_length": 1322.786376953125, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "entropy": 0.23624292761087418, "epoch": 2.3926282051282053, "frac_reward_zero_std": 0.53125, "grad_norm": 0.01766468957066536, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 994038987.0, "reward": 0.3370998203754425, "reward_std": 0.02162216417491436, "rewards/progression_diversity/mean": -0.000469559890916571, "rewards/progression_diversity/std": 0.005131890531629324, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.7734212279319763, "rewards/symbolic_reward_partial_score/std": 0.15896020829677582, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0494273900985718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 1.574073076248169, "step": 1493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.23203369975090027, "epoch": 2.394230769230769, "grad_norm": 0.007126044947654009, "learning_rate": 1e-06, "loss": 0.0539, "step": 1494 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23312392085790634, "epoch": 2.3958333333333335, "grad_norm": 0.007795875892043114, "learning_rate": 1e-06, "loss": -0.0035, "step": 1495 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23666821420192719, "epoch": 2.3974358974358974, "grad_norm": 0.007526164874434471, "learning_rate": 1e-06, "loss": 0.0024, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 1428.548828125, "completions/mean_terminated_length": 1281.0592041015625, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "entropy": 0.23371727764606476, "epoch": 2.3990384615384617, "frac_reward_zero_std": 0.59375, "grad_norm": 837.6632080078125, "learning_rate": 1e-06, "loss": 0.0729, "num_tokens": 995634948.0, "reward": 0.3479752242565155, "reward_std": 0.019150175154209137, "rewards/progression_diversity/mean": -0.00033024564618244767, "rewards/progression_diversity/std": 0.004327813629060984, "rewards/symbolic_reward_accuracy/mean": 0.216796875, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.7295898199081421, "rewards/symbolic_reward_partial_score/std": 0.2091808170080185, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046621322631836, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 3.1054999828338623, "step": 1497 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23887716233730316, "epoch": 2.4006410256410255, "grad_norm": 7.230476379394531, "learning_rate": 1e-06, "loss": 0.0044, "step": 1498 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.24147003889083862, "epoch": 2.40224358974359, "grad_norm": 0.013911988586187363, "learning_rate": 1e-06, "loss": -0.0055, "step": 1499 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24635042250156403, "epoch": 2.4038461538461537, "grad_norm": 0.013229484669864178, "learning_rate": 1e-06, "loss": 0.023, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 5214.0, "completions/mean_length": 1461.9140625, "completions/mean_terminated_length": 1314.75341796875, "completions/min_length": 430.0, "completions/min_terminated_length": 430.0, "entropy": 0.2452658787369728, "epoch": 2.405448717948718, "frac_reward_zero_std": 0.53125, "grad_norm": 125.90882873535156, "learning_rate": 1e-06, "loss": 0.0306, "num_tokens": 997203848.0, "reward": 0.2770632207393646, "reward_std": 0.023135796189308167, "rewards/progression_diversity/mean": -0.0007097757770679891, "rewards/progression_diversity/std": 0.01300110761076212, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.6839843988418579, "rewards/symbolic_reward_partial_score/std": 0.1883450597524643, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0471558570861816, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 2.527804374694824, "step": 1501 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.23859861493110657, "epoch": 2.407051282051282, "grad_norm": 577.0975952148438, "learning_rate": 1e-06, "loss": 0.0477, "step": 1502 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.2435135617852211, "epoch": 2.4086538461538463, "grad_norm": 0.014631603844463825, "learning_rate": 1e-06, "loss": 0.0252, "step": 1503 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2454121932387352, "epoch": 2.41025641025641, "grad_norm": 0.008022090420126915, "learning_rate": 1e-06, "loss": 0.0225, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4812.0, "completions/mean_length": 1448.013671875, "completions/mean_terminated_length": 1330.407470703125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "entropy": 0.24072068184614182, "epoch": 2.4118589743589745, "frac_reward_zero_std": 0.4375, "grad_norm": 0.024386148899793625, "learning_rate": 1e-06, "loss": -0.0065, "num_tokens": 998845663.0, "reward": 0.31266647577285767, "reward_std": 0.029008327051997185, "rewards/progression_diversity/mean": -0.00044497710769064724, "rewards/progression_diversity/std": 0.007153489161282778, "rewards/symbolic_reward_accuracy/mean": 0.166015625, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.7128092646598816, "rewards/symbolic_reward_partial_score/std": 0.1801445633172989, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0494557619094849, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 1.1010427474975586, "step": 1505 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22912582755088806, "epoch": 2.4134615384615383, "grad_norm": 0.013357684947550297, "learning_rate": 1e-06, "loss": 0.0729, "step": 1506 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.238428495824337, "epoch": 2.4150641025641026, "grad_norm": 0.02128361538052559, "learning_rate": 1e-06, "loss": 0.0199, "step": 1507 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.23703129589557648, "epoch": 2.4166666666666665, "grad_norm": 0.011308502405881882, "learning_rate": 1e-06, "loss": 0.0045, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 1454.595703125, "completions/mean_terminated_length": 1366.6031494140625, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.23324482142925262, "epoch": 2.418269230769231, "frac_reward_zero_std": 0.46875, "grad_norm": 0.020565425977110863, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 1000476448.0, "reward": 0.35293227434158325, "reward_std": 0.02206794172525406, "rewards/progression_diversity/mean": -0.00022909138351678848, "rewards/progression_diversity/std": 0.0050324564799666405, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.7409017086029053, "rewards/symbolic_reward_partial_score/std": 0.1797100305557251, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.047702431678772, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 756.0, "sampling/sampling_logp_difference/mean": 2.1723339557647705, "step": 1509 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2380816489458084, "epoch": 2.4198717948717947, "grad_norm": 0.009358495473861694, "learning_rate": 1e-06, "loss": 0.0104, "step": 1510 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.23932605236768723, "epoch": 2.421474358974359, "grad_norm": 0.012663176283240318, "learning_rate": 1e-06, "loss": 0.033, "step": 1511 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.23995714634656906, "epoch": 2.423076923076923, "grad_norm": 0.008335144259035587, "learning_rate": 1e-06, "loss": -0.0006, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 1581.033203125, "completions/mean_terminated_length": 1375.8436279296875, "completions/min_length": 429.0, "completions/min_terminated_length": 429.0, "entropy": 0.23756451904773712, "epoch": 2.4246794871794872, "frac_reward_zero_std": 0.40625, "grad_norm": 581.8189697265625, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 1002237585.0, "reward": 0.3079501986503601, "reward_std": 0.04506230354309082, "rewards/progression_diversity/mean": -0.0013673059875145555, "rewards/progression_diversity/std": 0.019772524014115334, "rewards/symbolic_reward_accuracy/mean": 0.166015625, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.6990722417831421, "rewards/symbolic_reward_partial_score/std": 0.19391867518424988, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0470722913742065, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 2.45670747756958, "step": 1513 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2470928058028221, "epoch": 2.426282051282051, "grad_norm": 0.01648869179189205, "learning_rate": 1e-06, "loss": -0.0059, "step": 1514 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.23297542333602905, "epoch": 2.4278846153846154, "grad_norm": 384.6247253417969, "learning_rate": 1e-06, "loss": 0.0991, "step": 1515 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.23933256417512894, "epoch": 2.4294871794871793, "grad_norm": 0.019361699000000954, "learning_rate": 1e-06, "loss": 0.0483, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 8324.0, "completions/mean_length": 1477.720703125, "completions/mean_terminated_length": 1360.348388671875, "completions/min_length": 362.0, "completions/min_terminated_length": 362.0, "entropy": 0.2378390207886696, "epoch": 2.4310897435897436, "frac_reward_zero_std": 0.53125, "grad_norm": 0.024526890367269516, "learning_rate": 1e-06, "loss": 0.0322, "num_tokens": 1003883634.0, "reward": 0.3165303170681, "reward_std": 0.02786339819431305, "rewards/progression_diversity/mean": -0.000289862509816885, "rewards/progression_diversity/std": 0.004467545077204704, "rewards/symbolic_reward_accuracy/mean": 0.146484375, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.7647460699081421, "rewards/symbolic_reward_partial_score/std": 0.18048354983329773, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0483635663986206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.4491806030273438, "step": 1517 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2445373684167862, "epoch": 2.4326923076923075, "grad_norm": 0.01748676225543022, "learning_rate": 1e-06, "loss": 0.0166, "step": 1518 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.23935458809137344, "epoch": 2.434294871794872, "grad_norm": 0.04788840189576149, "learning_rate": 1e-06, "loss": 0.0312, "step": 1519 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.24610072374343872, "epoch": 2.435897435897436, "grad_norm": 0.014646312221884727, "learning_rate": 1e-06, "loss": 0.0119, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3350.0, "completions/mean_length": 1556.814453125, "completions/mean_terminated_length": 1469.4244384765625, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.24429644644260406, "epoch": 2.4375, "frac_reward_zero_std": 0.34375, "grad_norm": 461.6620788574219, "learning_rate": 1e-06, "loss": 0.0507, "num_tokens": 1005568947.0, "reward": 0.3079223334789276, "reward_std": 0.038352809846401215, "rewards/progression_diversity/mean": -0.00024961589951999485, "rewards/progression_diversity/std": 0.004486089572310448, "rewards/symbolic_reward_accuracy/mean": 0.14453125, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.739306628704071, "rewards/symbolic_reward_partial_score/std": 0.18258456885814667, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0502259731292725, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.174125909805298, "step": 1521 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.24371539801359177, "epoch": 2.439102564102564, "grad_norm": 0.015633653849363327, "learning_rate": 1e-06, "loss": 0.0287, "step": 1522 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2456585019826889, "epoch": 2.440705128205128, "grad_norm": 0.021190311759710312, "learning_rate": 1e-06, "loss": 0.0031, "step": 1523 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.24361753463745117, "epoch": 2.4423076923076925, "grad_norm": 0.011759229004383087, "learning_rate": 1e-06, "loss": -0.0044, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5319.0, "completions/mean_length": 1642.1484375, "completions/mean_terminated_length": 1467.3438720703125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.24563409388065338, "epoch": 2.4439102564102564, "frac_reward_zero_std": 0.28125, "grad_norm": 1274.161376953125, "learning_rate": 1e-06, "loss": 0.0363, "num_tokens": 1007251439.0, "reward": 0.3221524655818939, "reward_std": 0.05205688625574112, "rewards/progression_diversity/mean": -0.0015531220706179738, "rewards/progression_diversity/std": 0.016780469566583633, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.6982421875, "rewards/symbolic_reward_partial_score/std": 0.20122045278549194, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0466976165771484, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.579723834991455, "step": 1525 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.23505118489265442, "epoch": 2.4455128205128207, "grad_norm": 5386.1142578125, "learning_rate": 1e-06, "loss": 0.7638, "step": 1526 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23239125311374664, "epoch": 2.4471153846153846, "grad_norm": 0.012965874746441841, "learning_rate": 1e-06, "loss": 0.0457, "step": 1527 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.23721230030059814, "epoch": 2.448717948717949, "grad_norm": 0.01935112476348877, "learning_rate": 1e-06, "loss": 0.0124, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5069.0, "completions/mean_length": 1434.068359375, "completions/mean_terminated_length": 1345.954833984375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.2473963499069214, "epoch": 2.4503205128205128, "frac_reward_zero_std": 0.4375, "grad_norm": 0.016699276864528656, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 1008849506.0, "reward": 0.3746347427368164, "reward_std": 0.0481521412730217, "rewards/progression_diversity/mean": -0.0008823598036542535, "rewards/progression_diversity/std": 0.01371445506811142, "rewards/symbolic_reward_accuracy/mean": 0.25390625, "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, "rewards/symbolic_reward_partial_score/mean": 0.7429524660110474, "rewards/symbolic_reward_partial_score/std": 0.21133190393447876, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0495123863220215, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 2.0027153491973877, "step": 1529 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.24604248255491257, "epoch": 2.451923076923077, "grad_norm": 0.012123506516218185, "learning_rate": 1e-06, "loss": 0.013, "step": 1530 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.24536657333374023, "epoch": 2.453525641025641, "grad_norm": 0.010596076026558876, "learning_rate": 1e-06, "loss": -0.007, "step": 1531 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.23595212399959564, "epoch": 2.4551282051282053, "grad_norm": 0.009966249577701092, "learning_rate": 1e-06, "loss": 0.1189, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5517.0, "completions/mean_length": 1541.44921875, "completions/mean_terminated_length": 1424.5787353515625, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.23892727494239807, "epoch": 2.456730769230769, "frac_reward_zero_std": 0.3125, "grad_norm": 0.019874002784490585, "learning_rate": 1e-06, "loss": -0.0075, "num_tokens": 1010481960.0, "reward": 0.4098053574562073, "reward_std": 0.033480141311883926, "rewards/progression_diversity/mean": -0.001889547100290656, "rewards/progression_diversity/std": 0.03145535662770271, "rewards/symbolic_reward_accuracy/mean": 0.30859375, "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, "rewards/symbolic_reward_partial_score/mean": 0.7514973878860474, "rewards/symbolic_reward_partial_score/std": 0.20948295295238495, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0455158948898315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.3153014183044434, "step": 1533 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23280858993530273, "epoch": 2.4583333333333335, "grad_norm": 0.01161316316574812, "learning_rate": 1e-06, "loss": 0.0201, "step": 1534 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2302737832069397, "epoch": 2.4599358974358974, "grad_norm": 0.030310511589050293, "learning_rate": 1e-06, "loss": 0.0335, "step": 1535 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.23068059235811234, "epoch": 2.4615384615384617, "grad_norm": 0.010563918389379978, "learning_rate": 1e-06, "loss": 0.054, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4376.0, "completions/mean_length": 1490.501953125, "completions/mean_terminated_length": 1373.2303466796875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.2301376685500145, "epoch": 2.4631410256410255, "frac_reward_zero_std": 0.34375, "grad_norm": 0.016749106347560883, "learning_rate": 1e-06, "loss": -0.0055, "num_tokens": 1012094073.0, "reward": 0.36312228441238403, "reward_std": 0.039926785975694656, "rewards/progression_diversity/mean": -0.0017360053025186062, "rewards/progression_diversity/std": 0.01971437782049179, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.7404134273529053, "rewards/symbolic_reward_partial_score/std": 0.20340432226657867, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045566439628601, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 2.109564781188965, "step": 1537 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22669820487499237, "epoch": 2.46474358974359, "grad_norm": 0.013202717527747154, "learning_rate": 1e-06, "loss": 0.0247, "step": 1538 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.23074190318584442, "epoch": 2.4663461538461537, "grad_norm": 0.028765065595507622, "learning_rate": 1e-06, "loss": 0.0033, "step": 1539 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.224589504301548, "epoch": 2.467948717948718, "grad_norm": 0.021383749321103096, "learning_rate": 1e-06, "loss": 0.0617, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4875.0, "completions/mean_length": 1534.798828125, "completions/mean_terminated_length": 1388.3570556640625, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 0.2195545881986618, "epoch": 2.469551282051282, "frac_reward_zero_std": 0.40625, "grad_norm": 0.039864543825387955, "learning_rate": 1e-06, "loss": 0.0975, "num_tokens": 1013848578.0, "reward": 0.2403162568807602, "reward_std": 0.0261523500084877, "rewards/progression_diversity/mean": -0.0010903782676905394, "rewards/progression_diversity/std": 0.020245160907506943, "rewards/symbolic_reward_accuracy/mean": 0.060546875, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.6832519769668579, "rewards/symbolic_reward_partial_score/std": 0.18514306843280792, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0433101654052734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.134794235229492, "step": 1541 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.22596587240695953, "epoch": 2.4711538461538463, "grad_norm": 0.01767021417617798, "learning_rate": 1e-06, "loss": -0.0098, "step": 1542 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.22497133910655975, "epoch": 2.47275641025641, "grad_norm": 0.013781007379293442, "learning_rate": 1e-06, "loss": 0.0239, "step": 1543 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2270878478884697, "epoch": 2.4743589743589745, "grad_norm": 0.011908365413546562, "learning_rate": 1e-06, "loss": 0.019, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3915.0, "completions/mean_length": 1276.716796875, "completions/mean_terminated_length": 1247.152587890625, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "entropy": 0.23582519590854645, "epoch": 2.4759615384615383, "frac_reward_zero_std": 0.65625, "grad_norm": 581.7129516601562, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 1015333441.0, "reward": 0.40914207696914673, "reward_std": 0.014557499438524246, "rewards/progression_diversity/mean": -0.0003453929675742984, "rewards/progression_diversity/std": 0.00676377210766077, "rewards/symbolic_reward_accuracy/mean": 0.310546875, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.7433756589889526, "rewards/symbolic_reward_partial_score/std": 0.229752317070961, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0511703491210938, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 0.7563881278038025, "step": 1545 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.23144873976707458, "epoch": 2.4775641025641026, "grad_norm": 0.010454997420310974, "learning_rate": 1e-06, "loss": 0.0032, "step": 1546 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2326817438006401, "epoch": 2.4791666666666665, "grad_norm": 0.01226350199431181, "learning_rate": 1e-06, "loss": -0.0076, "step": 1547 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.23457026481628418, "epoch": 2.480769230769231, "grad_norm": 0.009266077540814877, "learning_rate": 1e-06, "loss": -0.0032, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4929.0, "completions/mean_length": 1437.140625, "completions/mean_terminated_length": 1289.7357177734375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 0.21618512272834778, "epoch": 2.4823717948717947, "frac_reward_zero_std": 0.34375, "grad_norm": 437.86102294921875, "learning_rate": 1e-06, "loss": 0.0556, "num_tokens": 1016930265.0, "reward": 0.3245599865913391, "reward_std": 0.04626595973968506, "rewards/progression_diversity/mean": -0.0015235436148941517, "rewards/progression_diversity/std": 0.01856166310608387, "rewards/symbolic_reward_accuracy/mean": 0.193359375, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.6984537839889526, "rewards/symbolic_reward_partial_score/std": 0.2078658640384674, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044947862625122, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 756.0, "sampling/sampling_logp_difference/mean": 2.158604145050049, "step": 1549 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.23416189849376678, "epoch": 2.483974358974359, "grad_norm": 0.011794805526733398, "learning_rate": 1e-06, "loss": -0.0026, "step": 1550 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2325107902288437, "epoch": 2.485576923076923, "grad_norm": 0.018296558409929276, "learning_rate": 1e-06, "loss": 0.0022, "step": 1551 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.21691740304231644, "epoch": 2.4871794871794872, "grad_norm": 0.03286243975162506, "learning_rate": 1e-06, "loss": 0.0341, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4565.0, "completions/mean_length": 1326.94921875, "completions/mean_terminated_length": 1238.204345703125, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.21416771411895752, "epoch": 2.488782051282051, "frac_reward_zero_std": 0.34375, "grad_norm": 242.9971160888672, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 1018587343.0, "reward": 0.24987833201885223, "reward_std": 0.025197582319378853, "rewards/progression_diversity/mean": -0.0004485528916120529, "rewards/progression_diversity/std": 0.0056631616316735744, "rewards/symbolic_reward_accuracy/mean": 0.08984375, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.6552083492279053, "rewards/symbolic_reward_partial_score/std": 0.1901572048664093, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0427167415618896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.669922351837158, "step": 1553 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.21156911551952362, "epoch": 2.4903846153846154, "grad_norm": 0.014419353567063808, "learning_rate": 1e-06, "loss": 0.0138, "step": 1554 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.21717411279678345, "epoch": 2.4919871794871793, "grad_norm": 0.011920792050659657, "learning_rate": 1e-06, "loss": 0.0193, "step": 1555 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.21699684858322144, "epoch": 2.4935897435897436, "grad_norm": 0.011259951628744602, "learning_rate": 1e-06, "loss": 0.0012, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3890.0, "completions/mean_length": 1333.09765625, "completions/mean_terminated_length": 1244.3890380859375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.2051466777920723, "epoch": 2.4951923076923075, "frac_reward_zero_std": 0.53125, "grad_norm": 179.58285522460938, "learning_rate": 1e-06, "loss": 0.0567, "num_tokens": 1020162753.0, "reward": 0.4125392436981201, "reward_std": 0.02173442952334881, "rewards/progression_diversity/mean": -0.0004710496577899903, "rewards/progression_diversity/std": 0.004797840025275946, "rewards/symbolic_reward_accuracy/mean": 0.3046875, "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, "rewards/symbolic_reward_partial_score/mean": 0.7670735716819763, "rewards/symbolic_reward_partial_score/std": 0.21079780161380768, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0423927307128906, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 1.7513947486877441, "step": 1557 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.21501071751117706, "epoch": 2.496794871794872, "grad_norm": 0.00948250014334917, "learning_rate": 1e-06, "loss": -0.0021, "step": 1558 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2102562114596367, "epoch": 2.498397435897436, "grad_norm": 0.007870234549045563, "learning_rate": 1e-06, "loss": 0.0207, "step": 1559 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.21096129715442657, "epoch": 2.5, "grad_norm": 0.007708584889769554, "learning_rate": 1e-06, "loss": -0.0001, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4414.0, "completions/mean_length": 1309.21484375, "completions/mean_terminated_length": 1220.365478515625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.20147471129894257, "epoch": 2.501602564102564, "frac_reward_zero_std": 0.34375, "grad_norm": 0.02653343416750431, "learning_rate": 1e-06, "loss": 0.0258, "num_tokens": 1021758815.0, "reward": 0.30301350355148315, "reward_std": 0.051949888467788696, "rewards/progression_diversity/mean": -0.003340594470500946, "rewards/progression_diversity/std": 0.030672553926706314, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.7074218988418579, "rewards/symbolic_reward_partial_score/std": 0.1824207305908203, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0406644344329834, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.7471396923065186, "step": 1561 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.20691817998886108, "epoch": 2.503205128205128, "grad_norm": 0.018390124663710594, "learning_rate": 1e-06, "loss": 0.0263, "step": 1562 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.20330524444580078, "epoch": 2.5048076923076925, "grad_norm": 0.01390912476927042, "learning_rate": 1e-06, "loss": 0.0307, "step": 1563 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.20695596933364868, "epoch": 2.5064102564102564, "grad_norm": 0.01517625991255045, "learning_rate": 1e-06, "loss": -0.0015, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4008.0, "completions/mean_length": 1362.990234375, "completions/mean_terminated_length": 1214.85400390625, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "entropy": 0.19462278485298157, "epoch": 2.5080128205128203, "frac_reward_zero_std": 0.4375, "grad_norm": 506.2625732421875, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 1023359562.0, "reward": 0.34047943353652954, "reward_std": 0.035891205072402954, "rewards/progression_diversity/mean": -0.0008854115731082857, "rewards/progression_diversity/std": 0.008040647022426128, "rewards/symbolic_reward_accuracy/mean": 0.205078125, "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, "rewards/symbolic_reward_partial_score/mean": 0.7280598878860474, "rewards/symbolic_reward_partial_score/std": 0.1988910287618637, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.037065029144287, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.887390375137329, "step": 1565 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.20349303632974625, "epoch": 2.5096153846153846, "grad_norm": 0.02085845172405243, "learning_rate": 1e-06, "loss": 0.0164, "step": 1566 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.1964193433523178, "epoch": 2.511217948717949, "grad_norm": 0.010187533684074879, "learning_rate": 1e-06, "loss": 0.0258, "step": 1567 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.20827952772378922, "epoch": 2.5128205128205128, "grad_norm": 0.00918852724134922, "learning_rate": 1e-06, "loss": 0.0049, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4599.0, "completions/mean_length": 1323.048828125, "completions/mean_terminated_length": 1234.281005859375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "entropy": 0.21191302686929703, "epoch": 2.5144230769230766, "frac_reward_zero_std": 0.3125, "grad_norm": 0.020111212506890297, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 1024883955.0, "reward": 0.3286048173904419, "reward_std": 0.0436086431145668, "rewards/progression_diversity/mean": -0.0013349888613447547, "rewards/progression_diversity/std": 0.01073912438005209, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.7184407711029053, "rewards/symbolic_reward_partial_score/std": 0.20771624147891998, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0429773330688477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 756.0, "sampling/sampling_logp_difference/mean": 1.301384687423706, "step": 1569 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2052868902683258, "epoch": 2.516025641025641, "grad_norm": 0.018205396831035614, "learning_rate": 1e-06, "loss": 0.0922, "step": 1570 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.20686771720647812, "epoch": 2.5176282051282053, "grad_norm": 0.010167265310883522, "learning_rate": 1e-06, "loss": 0.0094, "step": 1571 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.20857567340135574, "epoch": 2.519230769230769, "grad_norm": 0.012847342528402805, "learning_rate": 1e-06, "loss": 0.026, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 1316.029296875, "completions/mean_terminated_length": 1197.3839111328125, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.2158561423420906, "epoch": 2.5208333333333335, "frac_reward_zero_std": 0.3125, "grad_norm": 0.013560623861849308, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 1026358946.0, "reward": 0.29157090187072754, "reward_std": 0.04181563854217529, "rewards/progression_diversity/mean": -0.0011149711208418012, "rewards/progression_diversity/std": 0.010894270613789558, "rewards/symbolic_reward_accuracy/mean": 0.130859375, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.7115234136581421, "rewards/symbolic_reward_partial_score/std": 0.1730296015739441, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0387065410614014, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.532683849334717, "step": 1573 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.20614393055438995, "epoch": 2.5224358974358974, "grad_norm": 0.023240137845277786, "learning_rate": 1e-06, "loss": 0.0632, "step": 1574 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.21372488886117935, "epoch": 2.5240384615384617, "grad_norm": 0.016561856493353844, "learning_rate": 1e-06, "loss": -0.0029, "step": 1575 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.209242083132267, "epoch": 2.5256410256410255, "grad_norm": 0.011606593616306782, "learning_rate": 1e-06, "loss": 0.0508, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4388.0, "completions/mean_length": 1337.361328125, "completions/mean_terminated_length": 1218.8839111328125, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.21050991117954254, "epoch": 2.52724358974359, "frac_reward_zero_std": 0.34375, "grad_norm": 682.1195068359375, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 1027800187.0, "reward": 0.3529677093029022, "reward_std": 0.03494004160165787, "rewards/progression_diversity/mean": -0.002058391459286213, "rewards/progression_diversity/std": 0.028598375618457794, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.7410807609558105, "rewards/symbolic_reward_partial_score/std": 0.19470229744911194, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0408451557159424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.35636568069458, "step": 1577 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.20368197560310364, "epoch": 2.5288461538461537, "grad_norm": 0.014921136200428009, "learning_rate": 1e-06, "loss": 0.0237, "step": 1578 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21754014492034912, "epoch": 2.530448717948718, "grad_norm": 0.018416838720440865, "learning_rate": 1e-06, "loss": -0.0029, "step": 1579 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.21085964888334274, "epoch": 2.532051282051282, "grad_norm": 0.022706199437379837, "learning_rate": 1e-06, "loss": 0.0528, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4464.0, "completions/mean_length": 1427.904296875, "completions/mean_terminated_length": 1250.559326171875, "completions/min_length": 433.0, "completions/min_terminated_length": 433.0, "entropy": 0.21092313528060913, "epoch": 2.5336538461538463, "frac_reward_zero_std": 0.28125, "grad_norm": 0.020028864964842796, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 1029409994.0, "reward": 0.32646113634109497, "reward_std": 0.04266618192195892, "rewards/progression_diversity/mean": -0.0013511213473975658, "rewards/progression_diversity/std": 0.012499148957431316, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.7243163585662842, "rewards/symbolic_reward_partial_score/std": 0.20394515991210938, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0392813682556152, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.5135602951049805, "step": 1581 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.20285531133413315, "epoch": 2.53525641025641, "grad_norm": 0.013795009814202785, "learning_rate": 1e-06, "loss": 0.035, "step": 1582 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2033623531460762, "epoch": 2.5368589743589745, "grad_norm": 0.010789959691464901, "learning_rate": 1e-06, "loss": 0.0538, "step": 1583 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2029578685760498, "epoch": 2.5384615384615383, "grad_norm": 0.009514546021819115, "learning_rate": 1e-06, "loss": 0.0284, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3833.0, "completions/mean_length": 1337.1015625, "completions/mean_terminated_length": 1218.6220703125, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "entropy": 0.19549165666103363, "epoch": 2.5400641025641026, "frac_reward_zero_std": 0.3125, "grad_norm": 663.7005004882812, "learning_rate": 1e-06, "loss": 0.0671, "num_tokens": 1031023614.0, "reward": 0.3425098657608032, "reward_std": 0.050741590559482574, "rewards/progression_diversity/mean": -0.0014545343583449721, "rewards/progression_diversity/std": 0.01686347834765911, "rewards/symbolic_reward_accuracy/mean": 0.220703125, "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, "rewards/symbolic_reward_partial_score/mean": 0.7029459476470947, "rewards/symbolic_reward_partial_score/std": 0.21629534661769867, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0390223264694214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.6212563514709473, "step": 1585 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2048923447728157, "epoch": 2.5416666666666665, "grad_norm": 0.015606733970344067, "learning_rate": 1e-06, "loss": -0.0031, "step": 1586 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.19919227808713913, "epoch": 2.543269230769231, "grad_norm": 0.02132081612944603, "learning_rate": 1e-06, "loss": 0.0211, "step": 1587 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2042047083377838, "epoch": 2.5448717948717947, "grad_norm": 0.01672261208295822, "learning_rate": 1e-06, "loss": -0.0112, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3849.0, "completions/mean_length": 1405.666015625, "completions/mean_terminated_length": 1287.726318359375, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "entropy": 0.20447014272212982, "epoch": 2.546474358974359, "frac_reward_zero_std": 0.40625, "grad_norm": 262.0090637207031, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 1032674947.0, "reward": 0.3156275749206543, "reward_std": 0.04013008996844292, "rewards/progression_diversity/mean": -0.0007197089726105332, "rewards/progression_diversity/std": 0.008052974939346313, "rewards/symbolic_reward_accuracy/mean": 0.181640625, "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, "rewards/symbolic_reward_partial_score/mean": 0.6914387941360474, "rewards/symbolic_reward_partial_score/std": 0.2133348137140274, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0389673709869385, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.4872024059295654, "step": 1589 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.1969490796327591, "epoch": 2.5480769230769234, "grad_norm": 0.020587697625160217, "learning_rate": 1e-06, "loss": 0.0394, "step": 1590 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2012443020939827, "epoch": 2.5496794871794872, "grad_norm": 0.012877855449914932, "learning_rate": 1e-06, "loss": -0.004, "step": 1591 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2060464546084404, "epoch": 2.551282051282051, "grad_norm": 0.017594829201698303, "learning_rate": 1e-06, "loss": 0.0178, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4988.0, "completions/mean_length": 1498.439453125, "completions/mean_terminated_length": 1351.6390380859375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "entropy": 0.20640329271554947, "epoch": 2.5528846153846154, "frac_reward_zero_std": 0.28125, "grad_norm": 406.1407470703125, "learning_rate": 1e-06, "loss": 0.0347, "num_tokens": 1034189668.0, "reward": 0.30137068033218384, "reward_std": 0.058662112802267075, "rewards/progression_diversity/mean": -0.00013947187107987702, "rewards/progression_diversity/std": 0.0018558743176981807, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.6946777105331421, "rewards/symbolic_reward_partial_score/std": 0.20915855467319489, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0420677661895752, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 756.0, "sampling/sampling_logp_difference/mean": 1.9947669506072998, "step": 1593 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.21118755638599396, "epoch": 2.5544871794871797, "grad_norm": 1052.406494140625, "learning_rate": 1e-06, "loss": 0.043, "step": 1594 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2124936804175377, "epoch": 2.5560897435897436, "grad_norm": 0.026562146842479706, "learning_rate": 1e-06, "loss": 0.0204, "step": 1595 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.21463356912136078, "epoch": 2.5576923076923075, "grad_norm": 0.01585652120411396, "learning_rate": 1e-06, "loss": 0.0246, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4207.0, "completions/mean_length": 1389.62890625, "completions/mean_terminated_length": 1330.8275146484375, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 0.2117195725440979, "epoch": 2.559294871794872, "frac_reward_zero_std": 0.375, "grad_norm": 0.02877124771475792, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 1035825670.0, "reward": 0.3299679160118103, "reward_std": 0.03433844819664955, "rewards/progression_diversity/mean": -0.00028184783877804875, "rewards/progression_diversity/std": 0.003830707399174571, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.7262043952941895, "rewards/symbolic_reward_partial_score/std": 0.18292047083377838, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0448970794677734, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 0.6958411931991577, "step": 1597 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.20743755996227264, "epoch": 2.560897435897436, "grad_norm": 0.028087658807635307, "learning_rate": 1e-06, "loss": 0.0076, "step": 1598 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.20653068274259567, "epoch": 2.5625, "grad_norm": 0.014355894178152084, "learning_rate": 1e-06, "loss": 0.0279, "step": 1599 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2106596827507019, "epoch": 2.564102564102564, "grad_norm": 0.01578899845480919, "learning_rate": 1e-06, "loss": 0.0246, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 1489.35546875, "completions/mean_terminated_length": 1282.8951416015625, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "entropy": 0.21462351083755493, "epoch": 2.565705128205128, "frac_reward_zero_std": 0.40625, "grad_norm": 820.033935546875, "learning_rate": 1e-06, "loss": 0.0379, "num_tokens": 1037399484.0, "reward": 0.3129954934120178, "reward_std": 0.025571607053279877, "rewards/progression_diversity/mean": -0.0007452977006323636, "rewards/progression_diversity/std": 0.01137583889067173, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.7347493171691895, "rewards/symbolic_reward_partial_score/std": 0.177503302693367, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0383716821670532, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.7996788024902344, "step": 1601 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21920231729745865, "epoch": 2.5673076923076925, "grad_norm": 0.01972138322889805, "learning_rate": 1e-06, "loss": 0.036, "step": 1602 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.21449632197618484, "epoch": 2.5689102564102564, "grad_norm": 0.015361227095127106, "learning_rate": 1e-06, "loss": 0.0557, "step": 1603 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.21294260025024414, "epoch": 2.5705128205128203, "grad_norm": 0.009509088471531868, "learning_rate": 1e-06, "loss": 0.0432, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5541.0, "completions/mean_length": 1318.08984375, "completions/mean_terminated_length": 1199.4605712890625, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.21971651911735535, "epoch": 2.5721153846153846, "frac_reward_zero_std": 0.46875, "grad_norm": 0.024938074871897697, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 1039039034.0, "reward": 0.29758530855178833, "reward_std": 0.030321484431624413, "rewards/progression_diversity/mean": -0.0017264732159674168, "rewards/progression_diversity/std": 0.027802200987935066, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.6899251341819763, "rewards/symbolic_reward_partial_score/std": 0.21471048891544342, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0427124500274658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 752.0, "sampling/sampling_logp_difference/mean": 3.036716938018799, "step": 1605 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2219068706035614, "epoch": 2.573717948717949, "grad_norm": 0.012791654095053673, "learning_rate": 1e-06, "loss": 0.0082, "step": 1606 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.20912151038646698, "epoch": 2.5753205128205128, "grad_norm": 0.008532814681529999, "learning_rate": 1e-06, "loss": 0.033, "step": 1607 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.22284353524446487, "epoch": 2.5769230769230766, "grad_norm": 0.019355937838554382, "learning_rate": 1e-06, "loss": 0.0272, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4659.0, "completions/mean_length": 1479.640625, "completions/mean_terminated_length": 1302.9091796875, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.22081206738948822, "epoch": 2.578525641025641, "frac_reward_zero_std": 0.34375, "grad_norm": 0.018469780683517456, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 1040598594.0, "reward": 0.41613101959228516, "reward_std": 0.07369263470172882, "rewards/progression_diversity/mean": -0.0035969598684459925, "rewards/progression_diversity/std": 0.033173561096191406, "rewards/symbolic_reward_accuracy/mean": 0.306640625, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.7778483033180237, "rewards/symbolic_reward_partial_score/std": 0.20071126520633698, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0395357608795166, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 756.0, "sampling/sampling_logp_difference/mean": 4.858417510986328, "step": 1609 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2339448481798172, "epoch": 2.5801282051282053, "grad_norm": 0.021145859733223915, "learning_rate": 1e-06, "loss": -0.0069, "step": 1610 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.22270727157592773, "epoch": 2.581730769230769, "grad_norm": 0.014868995174765587, "learning_rate": 1e-06, "loss": 0.0422, "step": 1611 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.21710411459207535, "epoch": 2.5833333333333335, "grad_norm": 1147.5833740234375, "learning_rate": 1e-06, "loss": 0.2004, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5854.0, "completions/mean_length": 1488.01953125, "completions/mean_terminated_length": 1311.387451171875, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.2146845906972885, "epoch": 2.5849358974358974, "frac_reward_zero_std": 0.28125, "grad_norm": 0.021604053676128387, "learning_rate": 1e-06, "loss": 0.0206, "num_tokens": 1042269212.0, "reward": 0.3228908181190491, "reward_std": 0.06467078626155853, "rewards/progression_diversity/mean": -0.0019355263793841004, "rewards/progression_diversity/std": 0.025548765435814857, "rewards/symbolic_reward_accuracy/mean": 0.16015625, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.7599608898162842, "rewards/symbolic_reward_partial_score/std": 0.1819555014371872, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0413014888763428, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.9018664360046387, "step": 1613 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.20722515881061554, "epoch": 2.5865384615384617, "grad_norm": 440.9079895019531, "learning_rate": 1e-06, "loss": 0.1145, "step": 1614 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.20744751393795013, "epoch": 2.5881410256410255, "grad_norm": 2084.2294921875, "learning_rate": 1e-06, "loss": 0.2864, "step": 1615 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.22113841027021408, "epoch": 2.58974358974359, "grad_norm": 0.01803417317569256, "learning_rate": 1e-06, "loss": -0.0001, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3889.0, "completions/mean_length": 1386.021484375, "completions/mean_terminated_length": 1297.624755859375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 0.21602119505405426, "epoch": 2.5913461538461537, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0642000138759613, "learning_rate": 1e-06, "loss": 0.0316, "num_tokens": 1043874039.0, "reward": 0.2649605870246887, "reward_std": 0.02608659118413925, "rewards/progression_diversity/mean": -0.0019893196877092123, "rewards/progression_diversity/std": 0.021042335778474808, "rewards/symbolic_reward_accuracy/mean": 0.095703125, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.693164050579071, "rewards/symbolic_reward_partial_score/std": 0.17582449316978455, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0435926914215088, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.047320604324341, "step": 1617 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2237110733985901, "epoch": 2.592948717948718, "grad_norm": 0.022002186626195908, "learning_rate": 1e-06, "loss": -0.0024, "step": 1618 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.21986420452594757, "epoch": 2.594551282051282, "grad_norm": 0.01582951843738556, "learning_rate": 1e-06, "loss": 0.0343, "step": 1619 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.22045395523309708, "epoch": 2.5961538461538463, "grad_norm": 0.021329455077648163, "learning_rate": 1e-06, "loss": 0.0004, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2783.0, "completions/mean_length": 1341.115234375, "completions/mean_terminated_length": 1311.6771240234375, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.22125251591205597, "epoch": 2.59775641025641, "frac_reward_zero_std": 0.4375, "grad_norm": 0.018027139827609062, "learning_rate": 1e-06, "loss": 0.0207, "num_tokens": 1045449474.0, "reward": 0.36266857385635376, "reward_std": 0.047155484557151794, "rewards/progression_diversity/mean": -0.0007219260442070663, "rewards/progression_diversity/std": 0.00793998222798109, "rewards/symbolic_reward_accuracy/mean": 0.25390625, "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, "rewards/symbolic_reward_partial_score/mean": 0.7017577886581421, "rewards/symbolic_reward_partial_score/std": 0.2232026308774948, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0492632389068604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 0.9296596050262451, "step": 1621 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.22497374564409256, "epoch": 2.5993589743589745, "grad_norm": 0.01935565285384655, "learning_rate": 1e-06, "loss": 0.0026, "step": 1622 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22052931785583496, "epoch": 2.6009615384615383, "grad_norm": 0.007344152312725782, "learning_rate": 1e-06, "loss": 0.0077, "step": 1623 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21955804526805878, "epoch": 2.6025641025641026, "grad_norm": 0.013848908245563507, "learning_rate": 1e-06, "loss": 0.0017, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3666.0, "completions/mean_length": 1220.310546875, "completions/mean_terminated_length": 1190.635986328125, "completions/min_length": 414.0, "completions/min_terminated_length": 414.0, "entropy": 0.23253807425498962, "epoch": 2.6041666666666665, "frac_reward_zero_std": 0.46875, "grad_norm": 0.01905795931816101, "learning_rate": 1e-06, "loss": 0.0033, "num_tokens": 1046961025.0, "reward": 0.31768307089805603, "reward_std": 0.016084838658571243, "rewards/progression_diversity/mean": -0.0002500782429706305, "rewards/progression_diversity/std": 0.003307226812466979, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.6885091066360474, "rewards/symbolic_reward_partial_score/std": 0.20202499628067017, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0517390966415405, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 756.0, "sampling/sampling_logp_difference/mean": 0.775344729423523, "step": 1625 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2306702509522438, "epoch": 2.605769230769231, "grad_norm": 0.009856280870735645, "learning_rate": 1e-06, "loss": 0.0303, "step": 1626 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22651749849319458, "epoch": 2.6073717948717947, "grad_norm": 0.03335234895348549, "learning_rate": 1e-06, "loss": -0.003, "step": 1627 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23627790808677673, "epoch": 2.608974358974359, "grad_norm": 0.01697065308690071, "learning_rate": 1e-06, "loss": -0.0023, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 10288.0, "completions/mean_length": 1333.587890625, "completions/mean_terminated_length": 1274.5667724609375, "completions/min_length": 498.0, "completions/min_terminated_length": 498.0, "entropy": 0.2252894639968872, "epoch": 2.6105769230769234, "frac_reward_zero_std": 0.375, "grad_norm": 0.10012649744749069, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 1048435198.0, "reward": 0.3364197015762329, "reward_std": 0.03154347091913223, "rewards/progression_diversity/mean": -0.0025622413959354162, "rewards/progression_diversity/std": 0.030175259336829185, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.7035156488418579, "rewards/symbolic_reward_partial_score/std": 0.19566431641578674, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0484051704406738, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.8174188137054443, "step": 1629 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.23386212438344955, "epoch": 2.6121794871794872, "grad_norm": 0.015600372105836868, "learning_rate": 1e-06, "loss": 0.0257, "step": 1630 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.23984410613775253, "epoch": 2.613782051282051, "grad_norm": 0.0189208984375, "learning_rate": 1e-06, "loss": -0.0074, "step": 1631 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.22813158482313156, "epoch": 2.6153846153846154, "grad_norm": 0.01574343629181385, "learning_rate": 1e-06, "loss": 0.0725, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3291.0, "completions/max_terminated_length": 3291.0, "completions/mean_length": 1187.220703125, "completions/mean_terminated_length": 1187.220703125, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "entropy": 0.23443499207496643, "epoch": 2.6169871794871797, "frac_reward_zero_std": 0.71875, "grad_norm": 0.015873905271291733, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 1049949615.0, "reward": 0.33012884855270386, "reward_std": 0.014471746981143951, "rewards/progression_diversity/mean": -0.0003017832641489804, "rewards/progression_diversity/std": 0.0035524494014680386, "rewards/symbolic_reward_accuracy/mean": 0.181640625, "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, "rewards/symbolic_reward_partial_score/mean": 0.7371581792831421, "rewards/symbolic_reward_partial_score/std": 0.1770060658454895, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0539133548736572, "sampling/importance_sampling_ratio/min": 0.0032137008383870125, "sampling/sampling_logp_difference/max": 5.740332126617432, "sampling/sampling_logp_difference/mean": 0.1061110571026802, "step": 1633 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2335355207324028, "epoch": 2.6185897435897436, "grad_norm": 0.008864963427186012, "learning_rate": 1e-06, "loss": 0.0012, "step": 1634 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.23033030331134796, "epoch": 2.6201923076923075, "grad_norm": 0.015018898993730545, "learning_rate": 1e-06, "loss": -0.0089, "step": 1635 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.23148179799318314, "epoch": 2.621794871794872, "grad_norm": 0.008959776721894741, "learning_rate": 1e-06, "loss": 0.0062, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 1215.390625, "completions/mean_terminated_length": 1155.906005859375, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "entropy": 0.2370157241821289, "epoch": 2.623397435897436, "frac_reward_zero_std": 0.5, "grad_norm": 0.01420600526034832, "learning_rate": 1e-06, "loss": 0.0079, "num_tokens": 1051439463.0, "reward": 0.3070603907108307, "reward_std": 0.03020370379090309, "rewards/progression_diversity/mean": -0.0019726285245269537, "rewards/progression_diversity/std": 0.030601099133491516, "rewards/symbolic_reward_accuracy/mean": 0.16796875, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.68896484375, "rewards/symbolic_reward_partial_score/std": 0.1941937357187271, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0498253107070923, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.9474784135818481, "step": 1637 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.23885345458984375, "epoch": 2.625, "grad_norm": 0.011274533346295357, "learning_rate": 1e-06, "loss": 0.024, "step": 1638 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.23263810575008392, "epoch": 2.626602564102564, "grad_norm": 0.008748682215809822, "learning_rate": 1e-06, "loss": 0.0306, "step": 1639 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.23605605959892273, "epoch": 2.628205128205128, "grad_norm": 0.020408501848578453, "learning_rate": 1e-06, "loss": -0.0049, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1151.375, "completions/mean_terminated_length": 1121.5655517578125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "entropy": 0.24552424252033234, "epoch": 2.6298076923076925, "frac_reward_zero_std": 0.5, "grad_norm": 0.022998787462711334, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 1052812775.0, "reward": 0.3075063228607178, "reward_std": 0.009556922130286694, "rewards/progression_diversity/mean": -0.0008343125809915364, "rewards/progression_diversity/std": 0.018091805279254913, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.7125488519668579, "rewards/symbolic_reward_partial_score/std": 0.18109886348247528, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0533249378204346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.3622500896453857, "step": 1641 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2431017979979515, "epoch": 2.6314102564102564, "grad_norm": 0.019999193027615547, "learning_rate": 1e-06, "loss": 0.0205, "step": 1642 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24063821882009506, "epoch": 2.6330128205128203, "grad_norm": 0.01059049554169178, "learning_rate": 1e-06, "loss": -0.0094, "step": 1643 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.24199344217777252, "epoch": 2.6346153846153846, "grad_norm": 0.015171530656516552, "learning_rate": 1e-06, "loss": 0.0055, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1190.326171875, "completions/mean_terminated_length": 1160.5928955078125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.24322400987148285, "epoch": 2.636217948717949, "frac_reward_zero_std": 0.5, "grad_norm": 0.021913990378379822, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 1054322382.0, "reward": 0.30744144320487976, "reward_std": 0.010562664829194546, "rewards/progression_diversity/mean": -0.0009744351846165955, "rewards/progression_diversity/std": 0.020204655826091766, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.7129882574081421, "rewards/symbolic_reward_partial_score/std": 0.17659372091293335, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.053013801574707, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.5290266275405884, "step": 1645 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2419656366109848, "epoch": 2.6378205128205128, "grad_norm": 0.02146642841398716, "learning_rate": 1e-06, "loss": 0.0012, "step": 1646 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24547536671161652, "epoch": 2.6394230769230766, "grad_norm": 0.012392179109156132, "learning_rate": 1e-06, "loss": 0.0009, "step": 1647 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.23846372961997986, "epoch": 2.641025641025641, "grad_norm": 1201.439208984375, "learning_rate": 1e-06, "loss": 0.034, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 1209.61328125, "completions/mean_terminated_length": 1179.9178466796875, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "entropy": 0.24202610552310944, "epoch": 2.6426282051282053, "frac_reward_zero_std": 0.5, "grad_norm": 0.014862586744129658, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 1055858648.0, "reward": 0.3161856532096863, "reward_std": 0.019514337182044983, "rewards/progression_diversity/mean": -0.00106208142824471, "rewards/progression_diversity/std": 0.018535632640123367, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.6711751222610474, "rewards/symbolic_reward_partial_score/std": 0.2153313010931015, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0529862642288208, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.4675102233886719, "step": 1649 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2419712170958519, "epoch": 2.644230769230769, "grad_norm": 0.009646804071962833, "learning_rate": 1e-06, "loss": 0.0225, "step": 1650 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24016830325126648, "epoch": 2.6458333333333335, "grad_norm": 0.02187509275972843, "learning_rate": 1e-06, "loss": -0.0017, "step": 1651 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2460290938615799, "epoch": 2.6474358974358974, "grad_norm": 0.018824264407157898, "learning_rate": 1e-06, "loss": -0.0001, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3980.0, "completions/max_terminated_length": 3980.0, "completions/mean_length": 1227.869140625, "completions/mean_terminated_length": 1227.869140625, "completions/min_length": 431.0, "completions/min_terminated_length": 431.0, "entropy": 0.24531883001327515, "epoch": 2.6490384615384617, "frac_reward_zero_std": 0.375, "grad_norm": 0.15125811100006104, "learning_rate": 1e-06, "loss": 0.0129, "num_tokens": 1057351989.0, "reward": 0.260877788066864, "reward_std": 0.027386680245399475, "rewards/progression_diversity/mean": -0.00011138351692352444, "rewards/progression_diversity/std": 0.0017274393467232585, "rewards/symbolic_reward_accuracy/mean": 0.09765625, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.6742838621139526, "rewards/symbolic_reward_partial_score/std": 0.16698527336120605, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0578720569610596, "sampling/importance_sampling_ratio/min": 0.0001738879072945565, "sampling/sampling_logp_difference/max": 8.657099723815918, "sampling/sampling_logp_difference/mean": 0.11238566040992737, "step": 1653 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2520115375518799, "epoch": 2.6506410256410255, "grad_norm": 0.016538454219698906, "learning_rate": 1e-06, "loss": -0.0015, "step": 1654 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.25293679535388947, "epoch": 2.65224358974359, "grad_norm": 0.020968111231923103, "learning_rate": 1e-06, "loss": -0.0085, "step": 1655 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.24919523298740387, "epoch": 2.6538461538461537, "grad_norm": 0.009686674922704697, "learning_rate": 1e-06, "loss": 0.0023, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 1314.712890625, "completions/mean_terminated_length": 1225.8958740234375, "completions/min_length": 391.0, "completions/min_terminated_length": 391.0, "entropy": 0.25128719210624695, "epoch": 2.655448717948718, "frac_reward_zero_std": 0.5, "grad_norm": 296.33270263671875, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 1058869346.0, "reward": 0.3539027273654938, "reward_std": 0.02425507828593254, "rewards/progression_diversity/mean": -0.00230794376693666, "rewards/progression_diversity/std": 0.02924177795648575, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.7077473998069763, "rewards/symbolic_reward_partial_score/std": 0.22301146388053894, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0505740642547607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.750328540802002, "step": 1657 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24680255353450775, "epoch": 2.657051282051282, "grad_norm": 0.022292504087090492, "learning_rate": 1e-06, "loss": 0.0275, "step": 1658 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2595788687467575, "epoch": 2.6586538461538463, "grad_norm": 0.007560514844954014, "learning_rate": 1e-06, "loss": -0.0003, "step": 1659 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.25351230800151825, "epoch": 2.66025641025641, "grad_norm": 0.011664212681353092, "learning_rate": 1e-06, "loss": 0.0056, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2635.0, "completions/mean_length": 1181.86328125, "completions/mean_terminated_length": 1152.113525390625, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "entropy": 0.2603015750646591, "epoch": 2.6618589743589745, "frac_reward_zero_std": 0.53125, "grad_norm": 0.01761663518846035, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 1060424988.0, "reward": 0.3372478485107422, "reward_std": 0.02446237951517105, "rewards/progression_diversity/mean": -0.0007998401415534317, "rewards/progression_diversity/std": 0.01764107309281826, "rewards/symbolic_reward_accuracy/mean": 0.181640625, "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, "rewards/symbolic_reward_partial_score/mean": 0.7609050273895264, "rewards/symbolic_reward_partial_score/std": 0.18756793439388275, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0561124086380005, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.5122106075286865, "step": 1661 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2552846670150757, "epoch": 2.6634615384615383, "grad_norm": 0.009117074310779572, "learning_rate": 1e-06, "loss": 0.0035, "step": 1662 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.24909861385822296, "epoch": 2.6650641025641026, "grad_norm": 0.019102146849036217, "learning_rate": 1e-06, "loss": 0.0028, "step": 1663 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.25790928304195404, "epoch": 2.6666666666666665, "grad_norm": 0.010081185959279537, "learning_rate": 1e-06, "loss": -0.0069, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 1168.1640625, "completions/mean_terminated_length": 1168.1640625, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "entropy": 0.26866577565670013, "epoch": 2.668269230769231, "frac_reward_zero_std": 0.5625, "grad_norm": 0.02700965479016304, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 1061838736.0, "reward": 0.32009169459342957, "reward_std": 0.021595628932118416, "rewards/progression_diversity/mean": -0.00010825111530721188, "rewards/progression_diversity/std": 0.0018366762669757009, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.6919758915901184, "rewards/symbolic_reward_partial_score/std": 0.20659130811691284, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0602461099624634, "sampling/importance_sampling_ratio/min": 7.974762411322445e-05, "sampling/sampling_logp_difference/max": 9.436643600463867, "sampling/sampling_logp_difference/mean": 0.11774517595767975, "step": 1665 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2683710306882858, "epoch": 2.6698717948717947, "grad_norm": 0.02114093489944935, "learning_rate": 1e-06, "loss": -0.0047, "step": 1666 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.268236443400383, "epoch": 2.671474358974359, "grad_norm": 0.009307913482189178, "learning_rate": 1e-06, "loss": 0.0008, "step": 1667 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2630407512187958, "epoch": 2.6730769230769234, "grad_norm": 0.01072956994175911, "learning_rate": 1e-06, "loss": -0.0018, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 1157.5234375, "completions/mean_terminated_length": 1157.5234375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.26273058354854584, "epoch": 2.6746794871794872, "frac_reward_zero_std": 0.53125, "grad_norm": 0.022477174177765846, "learning_rate": 1e-06, "loss": 0.0039, "num_tokens": 1063326540.0, "reward": 0.3207314610481262, "reward_std": 0.03389011323451996, "rewards/progression_diversity/mean": -9.633351874072105e-05, "rewards/progression_diversity/std": 0.0015180562622845173, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.7175455689430237, "rewards/symbolic_reward_partial_score/std": 0.2033698707818985, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0595388412475586, "sampling/importance_sampling_ratio/min": 0.0003233875031583011, "sampling/sampling_logp_difference/max": 8.036659240722656, "sampling/sampling_logp_difference/mean": 0.11578333377838135, "step": 1669 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.26235876977443695, "epoch": 2.676282051282051, "grad_norm": 0.01767921820282936, "learning_rate": 1e-06, "loss": 0.0022, "step": 1670 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2589954137802124, "epoch": 2.6778846153846154, "grad_norm": 0.01393952313810587, "learning_rate": 1e-06, "loss": 0.0022, "step": 1671 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2695692628622055, "epoch": 2.6794871794871797, "grad_norm": 0.02042005956172943, "learning_rate": 1e-06, "loss": -0.0056, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 1089.64453125, "completions/mean_terminated_length": 1089.64453125, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.2677207440137863, "epoch": 2.6810897435897436, "frac_reward_zero_std": 0.59375, "grad_norm": 0.010551492683589458, "learning_rate": 1e-06, "loss": -0.004, "num_tokens": 1064821670.0, "reward": 0.2943932116031647, "reward_std": 0.024308741092681885, "rewards/progression_diversity/mean": -0.00013180242967791855, "rewards/progression_diversity/std": 0.0023829475976526737, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.7156900763511658, "rewards/symbolic_reward_partial_score/std": 0.18010446429252625, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060936450958252, "sampling/importance_sampling_ratio/min": 0.002229129895567894, "sampling/sampling_logp_difference/max": 6.106143951416016, "sampling/sampling_logp_difference/mean": 0.11975084245204926, "step": 1673 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2699047178030014, "epoch": 2.6826923076923075, "grad_norm": 0.017179284244775772, "learning_rate": 1e-06, "loss": 0.0027, "step": 1674 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.26817895472049713, "epoch": 2.684294871794872, "grad_norm": 0.022273728623986244, "learning_rate": 1e-06, "loss": -0.0012, "step": 1675 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.27202919125556946, "epoch": 2.685897435897436, "grad_norm": 0.007613576482981443, "learning_rate": 1e-06, "loss": 0.0027, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2164.0, "completions/max_terminated_length": 2164.0, "completions/mean_length": 1106.2890625, "completions/mean_terminated_length": 1106.2890625, "completions/min_length": 440.0, "completions/min_terminated_length": 440.0, "entropy": 0.27295637130737305, "epoch": 2.6875, "frac_reward_zero_std": 0.4375, "grad_norm": 0.018397819250822067, "learning_rate": 1e-06, "loss": 0.0038, "num_tokens": 1066328586.0, "reward": 0.21672746539115906, "reward_std": 0.02070911042392254, "rewards/progression_diversity/mean": -0.00010542837844695896, "rewards/progression_diversity/std": 0.0020016725175082684, "rewards/symbolic_reward_accuracy/mean": 0.02734375, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.6677408814430237, "rewards/symbolic_reward_partial_score/std": 0.1652555614709854, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062442421913147, "sampling/importance_sampling_ratio/min": 0.002278130268678069, "sampling/sampling_logp_difference/max": 6.084400177001953, "sampling/sampling_logp_difference/mean": 0.12101897597312927, "step": 1677 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2777542769908905, "epoch": 2.689102564102564, "grad_norm": 0.009125451557338238, "learning_rate": 1e-06, "loss": 0.0035, "step": 1678 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.273501455783844, "epoch": 2.690705128205128, "grad_norm": 0.010436906479299068, "learning_rate": 1e-06, "loss": 0.0021, "step": 1679 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2719372361898422, "epoch": 2.6923076923076925, "grad_norm": 0.012276146560907364, "learning_rate": 1e-06, "loss": -0.0069, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 1127.42578125, "completions/mean_terminated_length": 1127.42578125, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.27352291345596313, "epoch": 2.6939102564102564, "frac_reward_zero_std": 0.59375, "grad_norm": 0.020503515377640724, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 1067652580.0, "reward": 0.39523687958717346, "reward_std": 0.03253398835659027, "rewards/progression_diversity/mean": -0.0002392895403318107, "rewards/progression_diversity/std": 0.004227474331855774, "rewards/symbolic_reward_accuracy/mean": 0.2578125, "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, "rewards/symbolic_reward_partial_score/mean": 0.8018391728401184, "rewards/symbolic_reward_partial_score/std": 0.16659002006053925, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0630683898925781, "sampling/importance_sampling_ratio/min": 0.0012960727326571941, "sampling/sampling_logp_difference/max": 6.648416519165039, "sampling/sampling_logp_difference/mean": 0.12232454121112823, "step": 1681 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2841361165046692, "epoch": 2.6955128205128203, "grad_norm": 0.013175196945667267, "learning_rate": 1e-06, "loss": 0.0021, "step": 1682 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2792322635650635, "epoch": 2.6971153846153846, "grad_norm": 0.008740157820284367, "learning_rate": 1e-06, "loss": -0.0037, "step": 1683 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2852344512939453, "epoch": 2.698717948717949, "grad_norm": 0.007328695617616177, "learning_rate": 1e-06, "loss": 0.0062, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2764.0, "completions/max_terminated_length": 2764.0, "completions/mean_length": 1182.7734375, "completions/mean_terminated_length": 1182.7734375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.27371834218502045, "epoch": 2.7003205128205128, "frac_reward_zero_std": 0.625, "grad_norm": 0.016033172607421875, "learning_rate": 1e-06, "loss": -0.0049, "num_tokens": 1069083168.0, "reward": 0.3871532678604126, "reward_std": 0.036548394709825516, "rewards/progression_diversity/mean": -8.401693776249886e-06, "rewards/progression_diversity/std": 0.00019010863616131246, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.7592610716819763, "rewards/symbolic_reward_partial_score/std": 0.18804891407489777, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619139671325684, "sampling/importance_sampling_ratio/min": 6.210852006915957e-05, "sampling/sampling_logp_difference/max": 9.686627388000488, "sampling/sampling_logp_difference/mean": 0.1205776035785675, "step": 1685 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.27223040163517, "epoch": 2.7019230769230766, "grad_norm": 0.009487297385931015, "learning_rate": 1e-06, "loss": 0.0021, "step": 1686 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2768988460302353, "epoch": 2.703525641025641, "grad_norm": 0.010462704114615917, "learning_rate": 1e-06, "loss": 0.0031, "step": 1687 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2717527002096176, "epoch": 2.7051282051282053, "grad_norm": 0.016962876543402672, "learning_rate": 1e-06, "loss": 0.003, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 1154.96875, "completions/mean_terminated_length": 1154.96875, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "entropy": 0.2763988673686981, "epoch": 2.706730769230769, "frac_reward_zero_std": 0.59375, "grad_norm": 0.011929732747375965, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 1070536032.0, "reward": 0.3324165344238281, "reward_std": 0.020372124388813972, "rewards/progression_diversity/mean": -4.5338445488596335e-05, "rewards/progression_diversity/std": 0.0006139642791822553, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.7408691644668579, "rewards/symbolic_reward_partial_score/std": 0.18128949403762817, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0632294416427612, "sampling/importance_sampling_ratio/min": 0.0005881256074644625, "sampling/sampling_logp_difference/max": 7.438570022583008, "sampling/sampling_logp_difference/mean": 0.12137885391712189, "step": 1689 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.27255162596702576, "epoch": 2.7083333333333335, "grad_norm": 0.02167939394712448, "learning_rate": 1e-06, "loss": 0.0019, "step": 1690 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.28303536772727966, "epoch": 2.7099358974358974, "grad_norm": 0.008452474139630795, "learning_rate": 1e-06, "loss": -0.0025, "step": 1691 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2690129578113556, "epoch": 2.7115384615384617, "grad_norm": 0.010433075949549675, "learning_rate": 1e-06, "loss": 0.0047, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 1084.271484375, "completions/mean_terminated_length": 1054.3306884765625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.2840660959482193, "epoch": 2.7131410256410255, "frac_reward_zero_std": 0.65625, "grad_norm": 0.007845147512853146, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 1071857355.0, "reward": 0.41788747906684875, "reward_std": 0.011897813528776169, "rewards/progression_diversity/mean": -0.0008051774930208921, "rewards/progression_diversity/std": 0.01665707863867283, "rewards/symbolic_reward_accuracy/mean": 0.310546875, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.7718912363052368, "rewards/symbolic_reward_partial_score/std": 0.1966981589794159, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0603790283203125, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.6770976781845093, "step": 1693 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.27424998581409454, "epoch": 2.71474358974359, "grad_norm": 0.011158179491758347, "learning_rate": 1e-06, "loss": 0.0195, "step": 1694 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.279265433549881, "epoch": 2.7163461538461537, "grad_norm": 0.012044363655149937, "learning_rate": 1e-06, "loss": 0.0047, "step": 1695 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.27725885808467865, "epoch": 2.717948717948718, "grad_norm": 0.004128993488848209, "learning_rate": 1e-06, "loss": -0.0004, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 1174.107421875, "completions/mean_terminated_length": 1084.461669921875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "entropy": 0.2545585185289383, "epoch": 2.719551282051282, "frac_reward_zero_std": 0.53125, "grad_norm": 0.01764233596622944, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 1073390098.0, "reward": 0.3306075632572174, "reward_std": 0.017754849046468735, "rewards/progression_diversity/mean": -0.002232456346973777, "rewards/progression_diversity/std": 0.02855554409325123, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.731005847454071, "rewards/symbolic_reward_partial_score/std": 0.17919674515724182, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0483648777008057, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 5.032663345336914, "step": 1697 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2522525489330292, "epoch": 2.7211538461538463, "grad_norm": 0.014643147587776184, "learning_rate": 1e-06, "loss": -0.0005, "step": 1698 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.23972028493881226, "epoch": 2.72275641025641, "grad_norm": 0.014284703880548477, "learning_rate": 1e-06, "loss": 0.0231, "step": 1699 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2528829574584961, "epoch": 2.7243589743589745, "grad_norm": 0.007763241417706013, "learning_rate": 1e-06, "loss": 0.0057, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 1085.763671875, "completions/mean_terminated_length": 1055.8258056640625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "entropy": 0.25211136788129807, "epoch": 2.7259615384615383, "frac_reward_zero_std": 0.625, "grad_norm": 0.01731860637664795, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 1074838889.0, "reward": 0.37423521280288696, "reward_std": 0.012984178960323334, "rewards/progression_diversity/mean": -0.0007978131761774421, "rewards/progression_diversity/std": 0.017717914655804634, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.7552896738052368, "rewards/symbolic_reward_partial_score/std": 0.20005594193935394, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0550222396850586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.6980559825897217, "step": 1701 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.24782519042491913, "epoch": 2.7275641025641026, "grad_norm": 0.01143115945160389, "learning_rate": 1e-06, "loss": 0.001, "step": 1702 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.25386030972003937, "epoch": 2.7291666666666665, "grad_norm": 0.010584347881376743, "learning_rate": 1e-06, "loss": -0.001, "step": 1703 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.25097061693668365, "epoch": 2.730769230769231, "grad_norm": 0.014512878842651844, "learning_rate": 1e-06, "loss": -0.0028, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1766.0, "completions/mean_length": 1077.181640625, "completions/mean_terminated_length": 1017.1549682617188, "completions/min_length": 387.0, "completions/min_terminated_length": 387.0, "entropy": 0.2463693991303444, "epoch": 2.7323717948717947, "frac_reward_zero_std": 0.53125, "grad_norm": 0.022319326177239418, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 1076451366.0, "reward": 0.3169002830982208, "reward_std": 0.018300164490938187, "rewards/progression_diversity/mean": -0.0013787832576781511, "rewards/progression_diversity/std": 0.02173839509487152, "rewards/symbolic_reward_accuracy/mean": 0.16796875, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.7204427123069763, "rewards/symbolic_reward_partial_score/std": 0.20151053369045258, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0520621538162231, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.0996155738830566, "step": 1705 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24724167585372925, "epoch": 2.733974358974359, "grad_norm": 465.19256591796875, "learning_rate": 1e-06, "loss": 0.0165, "step": 1706 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.25137076526880264, "epoch": 2.7355769230769234, "grad_norm": 0.011684092693030834, "learning_rate": 1e-06, "loss": 0.0032, "step": 1707 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.24099694192409515, "epoch": 2.7371794871794872, "grad_norm": 0.01180903147906065, "learning_rate": 1e-06, "loss": -0.0007, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 1115.263671875, "completions/mean_terminated_length": 1025.2711181640625, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "entropy": 0.24923396110534668, "epoch": 2.738782051282051, "frac_reward_zero_std": 0.59375, "grad_norm": 0.022832239046692848, "learning_rate": 1e-06, "loss": -0.0048, "num_tokens": 1077919117.0, "reward": 0.4506452679634094, "reward_std": 0.006873677484691143, "rewards/progression_diversity/mean": -0.002368855057284236, "rewards/progression_diversity/std": 0.02794456295669079, "rewards/symbolic_reward_accuracy/mean": 0.34375, "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, "rewards/symbolic_reward_partial_score/mean": 0.8147298097610474, "rewards/symbolic_reward_partial_score/std": 0.17687095701694489, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0485377311706543, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 4.422557353973389, "step": 1709 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.24275388568639755, "epoch": 2.7403846153846154, "grad_norm": 0.020597660914063454, "learning_rate": 1e-06, "loss": 0.0452, "step": 1710 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.25289829075336456, "epoch": 2.7419871794871797, "grad_norm": 0.005713851656764746, "learning_rate": 1e-06, "loss": -0.0012, "step": 1711 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2434333637356758, "epoch": 2.7435897435897436, "grad_norm": 0.010084899142384529, "learning_rate": 1e-06, "loss": 0.0226, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1117.982421875, "completions/mean_terminated_length": 1088.107666015625, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "entropy": 0.24176087975502014, "epoch": 2.7451923076923075, "frac_reward_zero_std": 0.53125, "grad_norm": 0.014847025275230408, "learning_rate": 1e-06, "loss": -0.023, "num_tokens": 1079362852.0, "reward": 0.4161057472229004, "reward_std": 0.020435160025954247, "rewards/progression_diversity/mean": -0.0012444315943866968, "rewards/progression_diversity/std": 0.017949236556887627, "rewards/symbolic_reward_accuracy/mean": 0.302734375, "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, "rewards/symbolic_reward_partial_score/mean": 0.7815917730331421, "rewards/symbolic_reward_partial_score/std": 0.19830654561519623, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052306890487671, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.7721519470214844, "step": 1713 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2418888807296753, "epoch": 2.746794871794872, "grad_norm": 0.005131000652909279, "learning_rate": 1e-06, "loss": -0.0005, "step": 1714 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24066882580518723, "epoch": 2.748397435897436, "grad_norm": 0.012241684831678867, "learning_rate": 1e-06, "loss": 0.0029, "step": 1715 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2462095245718956, "epoch": 2.75, "grad_norm": 0.011896181851625443, "learning_rate": 1e-06, "loss": -0.0003, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1122.0, "completions/mean_terminated_length": 1062.1490478515625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.24476610869169235, "epoch": 2.751602564102564, "frac_reward_zero_std": 0.53125, "grad_norm": 0.011216786690056324, "learning_rate": 1e-06, "loss": -0.0053, "num_tokens": 1080775556.0, "reward": 0.31833428144454956, "reward_std": 0.009690433740615845, "rewards/progression_diversity/mean": -0.0015309633454307914, "rewards/progression_diversity/std": 0.02212413214147091, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.7486653923988342, "rewards/symbolic_reward_partial_score/std": 0.18320314586162567, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.050942063331604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.9385156631469727, "step": 1717 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23835279792547226, "epoch": 2.753205128205128, "grad_norm": 0.012194461189210415, "learning_rate": 1e-06, "loss": 0.232, "step": 1718 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.24551533162593842, "epoch": 2.7548076923076925, "grad_norm": 0.015713347122073174, "learning_rate": 1e-06, "loss": 0.0023, "step": 1719 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.25484059751033783, "epoch": 2.7564102564102564, "grad_norm": 0.01567044109106064, "learning_rate": 1e-06, "loss": -0.001, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 970.169921875, "completions/mean_terminated_length": 970.169921875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "entropy": 0.249983012676239, "epoch": 2.7580128205128203, "frac_reward_zero_std": 0.78125, "grad_norm": 0.010865814052522182, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 1082279163.0, "reward": 0.2930664122104645, "reward_std": 0.005064056254923344, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.7268880009651184, "rewards/symbolic_reward_partial_score/std": 0.17721883952617645, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0566366910934448, "sampling/importance_sampling_ratio/min": 0.0014420977095142007, "sampling/sampling_logp_difference/max": 6.541656494140625, "sampling/sampling_logp_difference/mean": 0.11134675145149231, "step": 1721 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.25236815214157104, "epoch": 2.7596153846153846, "grad_norm": 0.0043575153686106205, "learning_rate": 1e-06, "loss": -0.0021, "step": 1722 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.25090380012989044, "epoch": 2.761217948717949, "grad_norm": 0.00688148895278573, "learning_rate": 1e-06, "loss": -0.0008, "step": 1723 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2414015531539917, "epoch": 2.7628205128205128, "grad_norm": 0.0044527724385261536, "learning_rate": 1e-06, "loss": 0.002, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1888.0, "completions/mean_length": 1074.3984375, "completions/mean_terminated_length": 1044.4383544921875, "completions/min_length": 373.0, "completions/min_terminated_length": 373.0, "entropy": 0.26541852951049805, "epoch": 2.7644230769230766, "frac_reward_zero_std": 0.65625, "grad_norm": 0.007640301249921322, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 1083616935.0, "reward": 0.39704516530036926, "reward_std": 0.01641342230141163, "rewards/progression_diversity/mean": -0.0005633344408124685, "rewards/progression_diversity/std": 0.01171040628105402, "rewards/symbolic_reward_accuracy/mean": 0.2890625, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.7453775405883789, "rewards/symbolic_reward_partial_score/std": 0.20832401514053345, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0565519332885742, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.417606234550476, "step": 1725 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.26599733531475067, "epoch": 2.766025641025641, "grad_norm": 0.016951909288764, "learning_rate": 1e-06, "loss": -0.002, "step": 1726 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.25999362766742706, "epoch": 2.7676282051282053, "grad_norm": 1.5784918069839478, "learning_rate": 1e-06, "loss": 0.0046, "step": 1727 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2638109028339386, "epoch": 2.769230769230769, "grad_norm": 0.009875231422483921, "learning_rate": 1e-06, "loss": -0.0014, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 1121.234375, "completions/mean_terminated_length": 1061.3804931640625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "entropy": 0.2469266802072525, "epoch": 2.7708333333333335, "frac_reward_zero_std": 0.625, "grad_norm": 0.012807636521756649, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 1085117135.0, "reward": 0.3859965205192566, "reward_std": 0.007197174243628979, "rewards/progression_diversity/mean": -0.0014215593691915274, "rewards/progression_diversity/std": 0.021102240309119225, "rewards/symbolic_reward_accuracy/mean": 0.28125, "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, "rewards/symbolic_reward_partial_score/mean": 0.7242025136947632, "rewards/symbolic_reward_partial_score/std": 0.20905202627182007, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0523045063018799, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 2.6319713592529297, "step": 1729 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2536063939332962, "epoch": 2.7724358974358974, "grad_norm": 0.009693522937595844, "learning_rate": 1e-06, "loss": -0.0024, "step": 1730 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.24922660738229752, "epoch": 2.7740384615384617, "grad_norm": 0.0067869494669139385, "learning_rate": 1e-06, "loss": 0.0263, "step": 1731 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.25212302803993225, "epoch": 2.7756410256410255, "grad_norm": 0.003368781413882971, "learning_rate": 1e-06, "loss": -0.0037, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1915.0, "completions/mean_length": 1387.89453125, "completions/mean_terminated_length": 1149.8612060546875, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "entropy": 0.2443920373916626, "epoch": 2.77724358974359, "frac_reward_zero_std": 0.5, "grad_norm": 510.38726806640625, "learning_rate": 1e-06, "loss": 0.0371, "num_tokens": 1086770873.0, "reward": 0.28071457147598267, "reward_std": 0.026247015222907066, "rewards/progression_diversity/mean": -0.0052025578916072845, "rewards/progression_diversity/std": 0.04079189524054527, "rewards/symbolic_reward_accuracy/mean": 0.115234375, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.7054198980331421, "rewards/symbolic_reward_partial_score/std": 0.18301890790462494, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0395216941833496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 8.083383560180664, "step": 1733 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.23890351504087448, "epoch": 2.7788461538461537, "grad_norm": 0.005275333765894175, "learning_rate": 1e-06, "loss": 0.0321, "step": 1734 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2496597245335579, "epoch": 2.780448717948718, "grad_norm": 0.018677234649658203, "learning_rate": 1e-06, "loss": -0.0038, "step": 1735 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.23257097601890564, "epoch": 2.782051282051282, "grad_norm": 0.006251712329685688, "learning_rate": 1e-06, "loss": 0.0448, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 1218.24609375, "completions/mean_terminated_length": 1128.860595703125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "entropy": 0.25235116481781006, "epoch": 2.7836538461538463, "frac_reward_zero_std": 0.5625, "grad_norm": 480.960693359375, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 1088287719.0, "reward": 0.30648308992385864, "reward_std": 0.03177080303430557, "rewards/progression_diversity/mean": -0.0020833953749388456, "rewards/progression_diversity/std": 0.02671230398118496, "rewards/symbolic_reward_accuracy/mean": 0.150390625, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.7208983898162842, "rewards/symbolic_reward_partial_score/std": 0.19757305085659027, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0507006645202637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.8391146659851074, "step": 1737 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2659503221511841, "epoch": 2.78525641025641, "grad_norm": 0.006545082200318575, "learning_rate": 1e-06, "loss": 0.0058, "step": 1738 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24851246178150177, "epoch": 2.7868589743589745, "grad_norm": 0.011122615076601505, "learning_rate": 1e-06, "loss": 0.0275, "step": 1739 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.26005537807941437, "epoch": 2.7884615384615383, "grad_norm": 0.01145921926945448, "learning_rate": 1e-06, "loss": -0.0071, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 1462.2109375, "completions/mean_terminated_length": 1164.9642333984375, "completions/min_length": 367.0, "completions/min_terminated_length": 367.0, "entropy": 0.24173830449581146, "epoch": 2.7900641025641026, "frac_reward_zero_std": 0.375, "grad_norm": 95.36475372314453, "learning_rate": 1e-06, "loss": 0.0115, "num_tokens": 1089973347.0, "reward": 0.2687704563140869, "reward_std": 0.03094436414539814, "rewards/progression_diversity/mean": -0.007230142131447792, "rewards/progression_diversity/std": 0.051848724484443665, "rewards/symbolic_reward_accuracy/mean": 0.08984375, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.7164551019668579, "rewards/symbolic_reward_partial_score/std": 0.19291111826896667, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.028411865234375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 14.061389923095703, "step": 1741 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24212167412042618, "epoch": 2.7916666666666665, "grad_norm": 0.023332608863711357, "learning_rate": 1e-06, "loss": 0.0322, "step": 1742 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.24151303619146347, "epoch": 2.793269230769231, "grad_norm": 0.02178528904914856, "learning_rate": 1e-06, "loss": 0.0041, "step": 1743 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2449488863348961, "epoch": 2.7948717948717947, "grad_norm": 0.00785065721720457, "learning_rate": 1e-06, "loss": 0.0433, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2171.0, "completions/mean_length": 1198.900390625, "completions/mean_terminated_length": 1139.35107421875, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 0.2584023177623749, "epoch": 2.796474358974359, "frac_reward_zero_std": 0.53125, "grad_norm": 0.017909616231918335, "learning_rate": 1e-06, "loss": 0.0197, "num_tokens": 1091436528.0, "reward": 0.4526541233062744, "reward_std": 0.034945450723171234, "rewards/progression_diversity/mean": -0.0016795285046100616, "rewards/progression_diversity/std": 0.023316482082009315, "rewards/symbolic_reward_accuracy/mean": 0.37109375, "rewards/symbolic_reward_accuracy/std": 0.4835699498653412, "rewards/symbolic_reward_partial_score/mean": 0.7667155265808105, "rewards/symbolic_reward_partial_score/std": 0.22165250778198242, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0529496669769287, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 3.4778664112091064, "step": 1745 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.25567905604839325, "epoch": 2.7980769230769234, "grad_norm": 1.9885333776474, "learning_rate": 1e-06, "loss": -0.001, "step": 1746 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.26071760058403015, "epoch": 2.7996794871794872, "grad_norm": 0.01138362381607294, "learning_rate": 1e-06, "loss": 0.0018, "step": 1747 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2582208067178726, "epoch": 2.801282051282051, "grad_norm": 0.01637045480310917, "learning_rate": 1e-06, "loss": 0.0035, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 1168.337890625, "completions/mean_terminated_length": 1078.658203125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 0.26668301224708557, "epoch": 2.8028846153846154, "frac_reward_zero_std": 0.6875, "grad_norm": 0.014873064123094082, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 1092912237.0, "reward": 0.3330345153808594, "reward_std": 0.009103953838348389, "rewards/progression_diversity/mean": -0.0026997888926416636, "rewards/progression_diversity/std": 0.03452081233263016, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.7358561158180237, "rewards/symbolic_reward_partial_score/std": 0.1908014863729477, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0509870052337646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 4.842784881591797, "step": 1749 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.26296547055244446, "epoch": 2.8044871794871797, "grad_norm": 0.014554469846189022, "learning_rate": 1e-06, "loss": 0.0075, "step": 1750 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.25916317105293274, "epoch": 2.8060897435897436, "grad_norm": 0.012337586842477322, "learning_rate": 1e-06, "loss": 0.0315, "step": 1751 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2664582431316376, "epoch": 2.8076923076923075, "grad_norm": 0.006437097210437059, "learning_rate": 1e-06, "loss": -0.0017, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1078.322265625, "completions/mean_terminated_length": 1078.322265625, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "entropy": 0.26388947665691376, "epoch": 2.809294871794872, "frac_reward_zero_std": 0.6875, "grad_norm": 0.024757713079452515, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 1094326178.0, "reward": 0.39402249455451965, "reward_std": 0.020639346912503242, "rewards/progression_diversity/mean": -9.485271584708244e-05, "rewards/progression_diversity/std": 0.0016233575297519565, "rewards/symbolic_reward_accuracy/mean": 0.275390625, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.7626302242279053, "rewards/symbolic_reward_partial_score/std": 0.1950385868549347, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0604701042175293, "sampling/importance_sampling_ratio/min": 0.00016087631229311228, "sampling/sampling_logp_difference/max": 8.734874725341797, "sampling/sampling_logp_difference/mean": 0.11699830740690231, "step": 1753 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.26401379704475403, "epoch": 2.810897435897436, "grad_norm": 0.006932654418051243, "learning_rate": 1e-06, "loss": 0.0043, "step": 1754 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2620585560798645, "epoch": 2.8125, "grad_norm": 0.00885852426290512, "learning_rate": 1e-06, "loss": 0.0047, "step": 1755 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.26447711884975433, "epoch": 2.814102564102564, "grad_norm": 0.008724554441869259, "learning_rate": 1e-06, "loss": -0.005, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 1278.806640625, "completions/mean_terminated_length": 1099.6937255859375, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "entropy": 0.2583252787590027, "epoch": 2.815705128205128, "frac_reward_zero_std": 0.5625, "grad_norm": 0.008783966302871704, "learning_rate": 1e-06, "loss": 0.0674, "num_tokens": 1095875567.0, "reward": 0.38711655139923096, "reward_std": 0.0257473886013031, "rewards/progression_diversity/mean": -0.006121981889009476, "rewards/progression_diversity/std": 0.05701431632041931, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.7378580570220947, "rewards/symbolic_reward_partial_score/std": 0.21705490350723267, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0447036027908325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 7.182675838470459, "step": 1757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.273300439119339, "epoch": 2.8173076923076925, "grad_norm": 0.01463186927139759, "learning_rate": 1e-06, "loss": -0.0055, "step": 1758 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2602022588253021, "epoch": 2.8189102564102564, "grad_norm": 0.013082200661301613, "learning_rate": 1e-06, "loss": 0.0554, "step": 1759 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.26440712809562683, "epoch": 2.8205128205128203, "grad_norm": 0.010828658938407898, "learning_rate": 1e-06, "loss": 0.0194, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2079.0, "completions/mean_length": 1091.5390625, "completions/mean_terminated_length": 1061.612548828125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "entropy": 0.28021863102912903, "epoch": 2.8221153846153846, "frac_reward_zero_std": 0.65625, "grad_norm": 0.00968155823647976, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 1097272771.0, "reward": 0.33711838722229004, "reward_std": 0.00528107350692153, "rewards/progression_diversity/mean": -0.0010524296667426825, "rewards/progression_diversity/std": 0.02093338593840599, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.7487630248069763, "rewards/symbolic_reward_partial_score/std": 0.18481901288032532, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0589430332183838, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 1.9359979629516602, "step": 1761 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2707153856754303, "epoch": 2.823717948717949, "grad_norm": 0.007914133369922638, "learning_rate": 1e-06, "loss": 0.001, "step": 1762 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2671675831079483, "epoch": 2.8253205128205128, "grad_norm": 0.016453703865408897, "learning_rate": 1e-06, "loss": 0.4248, "step": 1763 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.27130477130413055, "epoch": 2.8269230769230766, "grad_norm": 0.009142329916357994, "learning_rate": 1e-06, "loss": -0.0026, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1321.56640625, "completions/mean_terminated_length": 1052.0595703125, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.25689977407455444, "epoch": 2.828525641025641, "frac_reward_zero_std": 0.40625, "grad_norm": 857.88134765625, "learning_rate": 1e-06, "loss": 0.0615, "num_tokens": 1098881909.0, "reward": 0.3542296588420868, "reward_std": 0.029991699382662773, "rewards/progression_diversity/mean": -0.008187920786440372, "rewards/progression_diversity/std": 0.060944657772779465, "rewards/symbolic_reward_accuracy/mean": 0.234375, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.712939441204071, "rewards/symbolic_reward_partial_score/std": 0.2108824998140335, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0325154066085815, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 13.213213920593262, "step": 1765 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2575305104255676, "epoch": 2.8301282051282053, "grad_norm": 0.018781762570142746, "learning_rate": 1e-06, "loss": 0.0015, "step": 1766 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2592518925666809, "epoch": 2.831730769230769, "grad_norm": 0.010047774761915207, "learning_rate": 1e-06, "loss": 0.0387, "step": 1767 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2506992667913437, "epoch": 2.8333333333333335, "grad_norm": 0.012858088128268719, "learning_rate": 1e-06, "loss": 0.0435, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1814.0, "completions/mean_length": 1162.033203125, "completions/mean_terminated_length": 1072.3162841796875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "entropy": 0.25953665375709534, "epoch": 2.8349358974358974, "frac_reward_zero_std": 0.625, "grad_norm": 402.998046875, "learning_rate": 1e-06, "loss": 0.0247, "num_tokens": 1100398246.0, "reward": 0.3336543142795563, "reward_std": 0.019593775272369385, "rewards/progression_diversity/mean": -0.002245709067210555, "rewards/progression_diversity/std": 0.029325280338525772, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.7450683116912842, "rewards/symbolic_reward_partial_score/std": 0.1828693449497223, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0506747961044312, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 4.43061637878418, "step": 1769 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2642872482538223, "epoch": 2.8365384615384617, "grad_norm": 0.010171863250434399, "learning_rate": 1e-06, "loss": -0.0029, "step": 1770 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.26166392862796783, "epoch": 2.8381410256410255, "grad_norm": 0.009490652941167355, "learning_rate": 1e-06, "loss": 0.0014, "step": 1771 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2598698139190674, "epoch": 2.83974358974359, "grad_norm": 0.010799145326018333, "learning_rate": 1e-06, "loss": 0.0263, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1231.958984375, "completions/mean_terminated_length": 1052.29052734375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "entropy": 0.2563736140727997, "epoch": 2.8413461538461537, "frac_reward_zero_std": 0.46875, "grad_norm": 0.015560018830001354, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 1102002417.0, "reward": 0.27941030263900757, "reward_std": 0.020234711468219757, "rewards/progression_diversity/mean": -0.005260556936264038, "rewards/progression_diversity/std": 0.04788186773657799, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.6821939945220947, "rewards/symbolic_reward_partial_score/std": 0.20531851053237915, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0375680923461914, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 9.631587028503418, "step": 1773 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.23910512030124664, "epoch": 2.842948717948718, "grad_norm": 0.008363538421690464, "learning_rate": 1e-06, "loss": 0.0743, "step": 1774 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.24283453822135925, "epoch": 2.844551282051282, "grad_norm": 49193.4765625, "learning_rate": 1e-06, "loss": 1.9294, "step": 1775 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2511303126811981, "epoch": 2.8461538461538463, "grad_norm": 0.016176847741007805, "learning_rate": 1e-06, "loss": -0.005, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 1422.087890625, "completions/mean_terminated_length": 1124.0418701171875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.2574124187231064, "epoch": 2.84775641025641, "frac_reward_zero_std": 0.4375, "grad_norm": 261.94879150390625, "learning_rate": 1e-06, "loss": -0.0031, "num_tokens": 1103595310.0, "reward": 0.3236244320869446, "reward_std": 0.023357607424259186, "rewards/progression_diversity/mean": -0.0086508272215724, "rewards/progression_diversity/std": 0.062096692621707916, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.7079427242279053, "rewards/symbolic_reward_partial_score/std": 0.21444584429264069, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.028921127319336, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 14.721025466918945, "step": 1777 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2630213350057602, "epoch": 2.8493589743589745, "grad_norm": 0.015530991367995739, "learning_rate": 1e-06, "loss": 0.0129, "step": 1778 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.25482146441936493, "epoch": 2.8509615384615383, "grad_norm": 2.474681854248047, "learning_rate": 1e-06, "loss": 0.0207, "step": 1779 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24518878757953644, "epoch": 2.8525641025641026, "grad_norm": 122.15824890136719, "learning_rate": 1e-06, "loss": 0.073, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 1393.974609375, "completions/mean_terminated_length": 1125.763427734375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "entropy": 0.24117282778024673, "epoch": 2.8541666666666665, "frac_reward_zero_std": 0.4375, "grad_norm": 894.6455688476562, "learning_rate": 1e-06, "loss": 0.0781, "num_tokens": 1105132289.0, "reward": 0.3265843987464905, "reward_std": 0.029642684385180473, "rewards/progression_diversity/mean": -0.007576026022434235, "rewards/progression_diversity/std": 0.05679089576005936, "rewards/symbolic_reward_accuracy/mean": 0.181640625, "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, "rewards/symbolic_reward_partial_score/mean": 0.7262369394302368, "rewards/symbolic_reward_partial_score/std": 0.19816945493221283, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0353561639785767, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 11.549863815307617, "step": 1781 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.26401175558567047, "epoch": 2.855769230769231, "grad_norm": 0.0201826523989439, "learning_rate": 1e-06, "loss": -0.0045, "step": 1782 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2670938968658447, "epoch": 2.8573717948717947, "grad_norm": 0.00789049081504345, "learning_rate": 1e-06, "loss": 0.0029, "step": 1783 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2526419162750244, "epoch": 2.858974358974359, "grad_norm": 5.186265468597412, "learning_rate": 1e-06, "loss": 0.0221, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 1483.630859375, "completions/mean_terminated_length": 1186.810791015625, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.2505309730768204, "epoch": 2.8605769230769234, "frac_reward_zero_std": 0.40625, "grad_norm": 22.291501998901367, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 1106690644.0, "reward": 0.33161357045173645, "reward_std": 0.03396005928516388, "rewards/progression_diversity/mean": -0.008566973730921745, "rewards/progression_diversity/std": 0.061884794384241104, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.7560546398162842, "rewards/symbolic_reward_partial_score/std": 0.18065473437309265, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0316046476364136, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 12.537492752075195, "step": 1785 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.23839128017425537, "epoch": 2.8621794871794872, "grad_norm": 0.012131825089454651, "learning_rate": 1e-06, "loss": 0.1023, "step": 1786 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2580532729625702, "epoch": 2.863782051282051, "grad_norm": 0.007426050491631031, "learning_rate": 1e-06, "loss": 0.0244, "step": 1787 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.258169487118721, "epoch": 2.8653846153846154, "grad_norm": 0.017984015867114067, "learning_rate": 1e-06, "loss": -0.003, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 1486.4765625, "completions/mean_terminated_length": 1189.713134765625, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "entropy": 0.23781320452690125, "epoch": 2.8669871794871797, "frac_reward_zero_std": 0.46875, "grad_norm": 482.7618713378906, "learning_rate": 1e-06, "loss": 0.0416, "num_tokens": 1108318152.0, "reward": 0.32290422916412354, "reward_std": 0.04777061939239502, "rewards/progression_diversity/mean": -0.008405622094869614, "rewards/progression_diversity/std": 0.060591381043195724, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.765429675579071, "rewards/symbolic_reward_partial_score/std": 0.16949279606342316, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0300309658050537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 12.117807388305664, "step": 1789 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.24075324833393097, "epoch": 2.8685897435897436, "grad_norm": 0.012247085571289062, "learning_rate": 1e-06, "loss": 0.0271, "step": 1790 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.23328307271003723, "epoch": 2.8701923076923075, "grad_norm": 0.010141503997147083, "learning_rate": 1e-06, "loss": 0.0239, "step": 1791 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.23753970116376877, "epoch": 2.871794871794872, "grad_norm": 0.014962738379836082, "learning_rate": 1e-06, "loss": 0.0154, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 1426.01953125, "completions/mean_terminated_length": 1188.59130859375, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 0.23268520832061768, "epoch": 2.873397435897436, "frac_reward_zero_std": 0.375, "grad_norm": 389.8380432128906, "learning_rate": 1e-06, "loss": 0.042, "num_tokens": 1109934338.0, "reward": 0.4080950617790222, "reward_std": 0.04739544540643692, "rewards/progression_diversity/mean": -0.005925622768700123, "rewards/progression_diversity/std": 0.04622631147503853, "rewards/symbolic_reward_accuracy/mean": 0.29296875, "rewards/symbolic_reward_accuracy/std": 0.455569326877594, "rewards/symbolic_reward_partial_score/mean": 0.7745768427848816, "rewards/symbolic_reward_partial_score/std": 0.19915655255317688, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0313217639923096, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 11.25175666809082, "step": 1793 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.23954074829816818, "epoch": 2.875, "grad_norm": 0.014046828262507915, "learning_rate": 1e-06, "loss": 0.0466, "step": 1794 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24329328536987305, "epoch": 2.876602564102564, "grad_norm": 0.012684703804552555, "learning_rate": 1e-06, "loss": -0.0077, "step": 1795 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.23238325119018555, "epoch": 2.878205128205128, "grad_norm": 0.020547054708003998, "learning_rate": 1e-06, "loss": 0.0476, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1598.23828125, "completions/mean_terminated_length": 1213.0380859375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "entropy": 0.24311521649360657, "epoch": 2.8798076923076925, "frac_reward_zero_std": 0.4375, "grad_norm": 125.55624389648438, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 1111646460.0, "reward": 0.3389037549495697, "reward_std": 0.025876596570014954, "rewards/progression_diversity/mean": -0.0070867519825696945, "rewards/progression_diversity/std": 0.045192137360572815, "rewards/symbolic_reward_accuracy/mean": 0.19921875, "rewards/symbolic_reward_accuracy/std": 0.39980348944664, "rewards/symbolic_reward_partial_score/mean": 0.7314778566360474, "rewards/symbolic_reward_partial_score/std": 0.1943960338830948, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0247465372085571, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 14.375904083251953, "step": 1797 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23341339081525803, "epoch": 2.8814102564102564, "grad_norm": 0.018049802631139755, "learning_rate": 1e-06, "loss": 0.1091, "step": 1798 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2326470911502838, "epoch": 2.8830128205128203, "grad_norm": 0.00664905272424221, "learning_rate": 1e-06, "loss": 0.0569, "step": 1799 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2430347353219986, "epoch": 2.8846153846153846, "grad_norm": 0.00851722713559866, "learning_rate": 1e-06, "loss": 0.0132, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2379.0, "completions/mean_length": 1422.78515625, "completions/mean_terminated_length": 1155.08935546875, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "entropy": 0.24775538593530655, "epoch": 2.886217948717949, "frac_reward_zero_std": 0.5, "grad_norm": 916.2537841796875, "learning_rate": 1e-06, "loss": 0.0312, "num_tokens": 1113154542.0, "reward": 0.4589552879333496, "reward_std": 0.024608189240098, "rewards/progression_diversity/mean": -0.0053527336567640305, "rewards/progression_diversity/std": 0.040160540491342545, "rewards/symbolic_reward_accuracy/mean": 0.361328125, "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, "rewards/symbolic_reward_partial_score/mean": 0.8086751699447632, "rewards/symbolic_reward_partial_score/std": 0.18015848100185394, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0337169170379639, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 10.59257698059082, "step": 1801 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.24687150865793228, "epoch": 2.8878205128205128, "grad_norm": 829.5130615234375, "learning_rate": 1e-06, "loss": 0.0824, "step": 1802 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.25254474580287933, "epoch": 2.8894230769230766, "grad_norm": 2157.5517578125, "learning_rate": 1e-06, "loss": 0.3136, "step": 1803 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.24701299518346786, "epoch": 2.891025641025641, "grad_norm": 0.011435293592512608, "learning_rate": 1e-06, "loss": 0.0445, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 1403.126953125, "completions/mean_terminated_length": 1195.4713134765625, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "entropy": 0.23765964061021805, "epoch": 2.8926282051282053, "frac_reward_zero_std": 0.4375, "grad_norm": 398.1752014160156, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 1114716159.0, "reward": 0.28466612100601196, "reward_std": 0.02723667398095131, "rewards/progression_diversity/mean": -0.004093291237950325, "rewards/progression_diversity/std": 0.03477524220943451, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.7035807371139526, "rewards/symbolic_reward_partial_score/std": 0.18481120467185974, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0375463962554932, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 8.100292205810547, "step": 1805 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.24171607196331024, "epoch": 2.894230769230769, "grad_norm": 16.580183029174805, "learning_rate": 1e-06, "loss": -0.0029, "step": 1806 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2345980554819107, "epoch": 2.8958333333333335, "grad_norm": 0.02494910918176174, "learning_rate": 1e-06, "loss": 0.0728, "step": 1807 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2448808252811432, "epoch": 2.8974358974358974, "grad_norm": 0.008014152757823467, "learning_rate": 1e-06, "loss": 0.0123, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3170.0, "completions/mean_length": 1560.744140625, "completions/mean_terminated_length": 1204.986083984375, "completions/min_length": 486.0, "completions/min_terminated_length": 486.0, "entropy": 0.2300274297595024, "epoch": 2.8990384615384617, "frac_reward_zero_std": 0.46875, "grad_norm": 1100.1624755859375, "learning_rate": 1e-06, "loss": 0.0962, "num_tokens": 1116373148.0, "reward": 0.3304210603237152, "reward_std": 0.0337834395468235, "rewards/progression_diversity/mean": -0.007212708704173565, "rewards/progression_diversity/std": 0.04686911776661873, "rewards/symbolic_reward_accuracy/mean": 0.197265625, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.7084146738052368, "rewards/symbolic_reward_partial_score/std": 0.2101738452911377, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0344150066375732, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 9.156167030334473, "step": 1809 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2424936443567276, "epoch": 2.9006410256410255, "grad_norm": 0.019458921626210213, "learning_rate": 1e-06, "loss": -0.0052, "step": 1810 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.23142275214195251, "epoch": 2.90224358974359, "grad_norm": 0.010163613595068455, "learning_rate": 1e-06, "loss": 0.2442, "step": 1811 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24503599107265472, "epoch": 2.9038461538461537, "grad_norm": 0.008980165235698223, "learning_rate": 1e-06, "loss": 0.0205, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 1808.796875, "completions/mean_terminated_length": 1185.4176025390625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "entropy": 0.23406819999217987, "epoch": 2.905448717948718, "frac_reward_zero_std": 0.375, "grad_norm": 73.40351867675781, "learning_rate": 1e-06, "loss": 0.0672, "num_tokens": 1118158052.0, "reward": 0.42395147681236267, "reward_std": 0.051604464650154114, "rewards/progression_diversity/mean": -0.012569092214107513, "rewards/progression_diversity/std": 0.06235016882419586, "rewards/symbolic_reward_accuracy/mean": 0.330078125, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.7547363042831421, "rewards/symbolic_reward_partial_score/std": 0.2372274249792099, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0196201801300049, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 15.569795608520508, "step": 1813 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.222855344414711, "epoch": 2.907051282051282, "grad_norm": 1161.0462646484375, "learning_rate": 1e-06, "loss": 63.2965, "step": 1814 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2360314056277275, "epoch": 2.9086538461538463, "grad_norm": 173744.71875, "learning_rate": 1e-06, "loss": 10.0443, "step": 1815 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2379593700170517, "epoch": 2.91025641025641, "grad_norm": 0.012492096051573753, "learning_rate": 1e-06, "loss": 0.0198, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1775.76171875, "completions/mean_terminated_length": 1212.7667236328125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.22906029224395752, "epoch": 2.9118589743589745, "frac_reward_zero_std": 0.28125, "grad_norm": 874.5888671875, "learning_rate": 1e-06, "loss": 0.0862, "num_tokens": 1119985066.0, "reward": 0.31860923767089844, "reward_std": 0.04233834519982338, "rewards/progression_diversity/mean": -0.011634357273578644, "rewards/progression_diversity/std": 0.059071075171232224, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.7160645127296448, "rewards/symbolic_reward_partial_score/std": 0.22811061143875122, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0222280025482178, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 15.096211433410645, "step": 1817 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.23098096251487732, "epoch": 2.9134615384615383, "grad_norm": 0.011047611013054848, "learning_rate": 1e-06, "loss": 0.0751, "step": 1818 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.22944733500480652, "epoch": 2.9150641025641026, "grad_norm": 0.008758867159485817, "learning_rate": 1e-06, "loss": 0.0656, "step": 1819 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.23350029438734055, "epoch": 2.9166666666666665, "grad_norm": 0.009284427389502525, "learning_rate": 1e-06, "loss": 0.0732, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 1809.248046875, "completions/mean_terminated_length": 1216.7784423828125, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "entropy": 0.23025192320346832, "epoch": 2.918269230769231, "frac_reward_zero_std": 0.28125, "grad_norm": 926.9324951171875, "learning_rate": 1e-06, "loss": 0.0657, "num_tokens": 1121805241.0, "reward": 0.36059021949768066, "reward_std": 0.031355924904346466, "rewards/progression_diversity/mean": -0.0122678866609931, "rewards/progression_diversity/std": 0.062373507767915726, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.7114908695220947, "rewards/symbolic_reward_partial_score/std": 0.22253166139125824, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0174920558929443, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 17.77764892578125, "step": 1821 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23264270275831223, "epoch": 2.9198717948717947, "grad_norm": 0.008918493054807186, "learning_rate": 1e-06, "loss": 0.0674, "step": 1822 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.22309153527021408, "epoch": 2.921474358974359, "grad_norm": 0.0191388800740242, "learning_rate": 1e-06, "loss": 0.1132, "step": 1823 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2341051548719406, "epoch": 2.9230769230769234, "grad_norm": 0.009794514626264572, "learning_rate": 1e-06, "loss": 0.0327, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2205.0, "completions/mean_length": 1469.75, "completions/mean_terminated_length": 1142.2913818359375, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "entropy": 0.24822328984737396, "epoch": 2.9246794871794872, "frac_reward_zero_std": 0.5, "grad_norm": 0.017211588099598885, "learning_rate": 1e-06, "loss": -0.0109, "num_tokens": 1123429833.0, "reward": 0.3736184239387512, "reward_std": 0.024019591510295868, "rewards/progression_diversity/mean": -0.007301281206309795, "rewards/progression_diversity/std": 0.050444815307855606, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.7514973878860474, "rewards/symbolic_reward_partial_score/std": 0.1985999196767807, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0336096286773682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 10.840036392211914, "step": 1825 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.23915088176727295, "epoch": 2.926282051282051, "grad_norm": 0.015742426738142967, "learning_rate": 1e-06, "loss": 0.0593, "step": 1826 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.23261761665344238, "epoch": 2.9278846153846154, "grad_norm": 0.004535183776170015, "learning_rate": 1e-06, "loss": 0.0502, "step": 1827 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.23625855147838593, "epoch": 2.9294871794871797, "grad_norm": 0.011525583453476429, "learning_rate": 1e-06, "loss": 0.0662, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 1803.416015625, "completions/mean_terminated_length": 1210.7093505859375, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "entropy": 0.22059651464223862, "epoch": 2.9310897435897436, "frac_reward_zero_std": 0.34375, "grad_norm": 478.7748718261719, "learning_rate": 1e-06, "loss": 0.1023, "num_tokens": 1125201182.0, "reward": 0.32501205801963806, "reward_std": 0.03538961708545685, "rewards/progression_diversity/mean": -0.011980684474110603, "rewards/progression_diversity/std": 0.05950572341680527, "rewards/symbolic_reward_accuracy/mean": 0.166015625, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.752392590045929, "rewards/symbolic_reward_partial_score/std": 0.18106764554977417, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.018181324005127, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 17.928653717041016, "step": 1829 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.22678129374980927, "epoch": 2.9326923076923075, "grad_norm": 0.018833013251423836, "learning_rate": 1e-06, "loss": 0.0311, "step": 1830 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.22647760808467865, "epoch": 2.934294871794872, "grad_norm": 17.264345169067383, "learning_rate": 1e-06, "loss": 0.0067, "step": 1831 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.21786439418792725, "epoch": 2.935897435897436, "grad_norm": 0.15827719867229462, "learning_rate": 1e-06, "loss": 0.0264, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 2331.796875, "completions/mean_terminated_length": 1173.1585693359375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.21573705226182938, "epoch": 2.9375, "frac_reward_zero_std": 0.1875, "grad_norm": 590.8438110351562, "learning_rate": 1e-06, "loss": 0.0605, "num_tokens": 1127233478.0, "reward": 0.3646172285079956, "reward_std": 0.0675165057182312, "rewards/progression_diversity/mean": -0.02313985861837864, "rewards/progression_diversity/std": 0.08131039142608643, "rewards/symbolic_reward_accuracy/mean": 0.2265625, "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, "rewards/symbolic_reward_partial_score/mean": 0.7630370855331421, "rewards/symbolic_reward_partial_score/std": 0.2085689753293991, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9808230996131897, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 36.245513916015625, "step": 1833 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2271483689546585, "epoch": 2.939102564102564, "grad_norm": 0.014887535013258457, "learning_rate": 1e-06, "loss": 0.0405, "step": 1834 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.21168261021375656, "epoch": 2.940705128205128, "grad_norm": 0.017205113545060158, "learning_rate": 1e-06, "loss": 0.1176, "step": 1835 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.20247046649456024, "epoch": 2.9423076923076925, "grad_norm": 0.014779583550989628, "learning_rate": 1e-06, "loss": 0.1315, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 2300.94921875, "completions/mean_terminated_length": 1171.92822265625, "completions/min_length": 449.0, "completions/min_terminated_length": 449.0, "entropy": 0.20949921756982803, "epoch": 2.9439102564102564, "frac_reward_zero_std": 0.34375, "grad_norm": 870.5208740234375, "learning_rate": 1e-06, "loss": 0.0751, "num_tokens": 1129310636.0, "reward": 0.45326170325279236, "reward_std": 0.044423650950193405, "rewards/progression_diversity/mean": -0.02392907440662384, "rewards/progression_diversity/std": 0.08498639613389969, "rewards/symbolic_reward_accuracy/mean": 0.365234375, "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, "rewards/symbolic_reward_partial_score/mean": 0.781201183795929, "rewards/symbolic_reward_partial_score/std": 0.22568704187870026, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9865758419036865, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 33.854530334472656, "step": 1837 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.21899111568927765, "epoch": 2.9455128205128203, "grad_norm": 0.012719093821942806, "learning_rate": 1e-06, "loss": 0.0941, "step": 1838 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.22411292791366577, "epoch": 2.9471153846153846, "grad_norm": 0.021860910579562187, "learning_rate": 1e-06, "loss": 0.0825, "step": 1839 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2084907665848732, "epoch": 2.948717948717949, "grad_norm": 0.005877522751688957, "learning_rate": 1e-06, "loss": 0.0637, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2297.0, "completions/mean_length": 2534.21484375, "completions/mean_terminated_length": 1232.098388671875, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.20938124507665634, "epoch": 2.9503205128205128, "frac_reward_zero_std": 0.125, "grad_norm": 560.36669921875, "learning_rate": 1e-06, "loss": 0.12, "num_tokens": 1131555370.0, "reward": 0.32735151052474976, "reward_std": 0.05115168169140816, "rewards/progression_diversity/mean": -0.03047451376914978, "rewards/progression_diversity/std": 0.09939130395650864, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.7093750238418579, "rewards/symbolic_reward_partial_score/std": 0.2335452288389206, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9635010361671448, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 45.37931442260742, "step": 1841 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2120969444513321, "epoch": 2.9519230769230766, "grad_norm": 381.89239501953125, "learning_rate": 1e-06, "loss": 821.5478, "step": 1842 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2079833373427391, "epoch": 2.953525641025641, "grad_norm": 0.010464330203831196, "learning_rate": 1e-06, "loss": 32.2227, "step": 1843 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.21436134725809097, "epoch": 2.9551282051282053, "grad_norm": 0.02022792212665081, "learning_rate": 1e-06, "loss": 0.1652, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 3449.130859375, "completions/mean_terminated_length": 1263.78759765625, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "entropy": 0.21006950736045837, "epoch": 2.956730769230769, "frac_reward_zero_std": 0.15625, "grad_norm": 1419.5054931640625, "learning_rate": 1e-06, "loss": 0.1083, "num_tokens": 1134191421.0, "reward": 0.2933524250984192, "reward_std": 0.04929710179567337, "rewards/progression_diversity/mean": -0.05147593468427658, "rewards/progression_diversity/std": 0.12628963589668274, "rewards/symbolic_reward_accuracy/mean": 0.150390625, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.6800781488418579, "rewards/symbolic_reward_partial_score/std": 0.23385514318943024, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9397692680358887, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 57.66311264038086, "step": 1845 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.1966499239206314, "epoch": 2.9583333333333335, "grad_norm": 16499.4140625, "learning_rate": 1e-06, "loss": 5.3166, "step": 1846 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.19310440123081207, "epoch": 2.9599358974358974, "grad_norm": 4365.603515625, "learning_rate": 1e-06, "loss": 0.5053, "step": 1847 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.1920364871621132, "epoch": 2.9615384615384617, "grad_norm": 0.009575553238391876, "learning_rate": 1e-06, "loss": 0.1589, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 3208.90625, "completions/mean_terminated_length": 1259.228759765625, "completions/min_length": 470.0, "completions/min_terminated_length": 470.0, "entropy": 0.2075439766049385, "epoch": 2.9631410256410255, "frac_reward_zero_std": 0.09375, "grad_norm": 574.5317993164062, "learning_rate": 1e-06, "loss": 0.0662, "num_tokens": 1136791005.0, "reward": 0.28958579897880554, "reward_std": 0.05835293233394623, "rewards/progression_diversity/mean": -0.04337439686059952, "rewards/progression_diversity/std": 0.11332310736179352, "rewards/symbolic_reward_accuracy/mean": 0.142578125, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.6835286617279053, "rewards/symbolic_reward_partial_score/std": 0.22093914449214935, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9411171674728394, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 56.884559631347656, "step": 1849 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.19637862592935562, "epoch": 2.96474358974359, "grad_norm": 0.008334570564329624, "learning_rate": 1e-06, "loss": 0.1807, "step": 1850 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.390625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.20494883507490158, "epoch": 2.9663461538461537, "grad_norm": 0.007710518781095743, "learning_rate": 1e-06, "loss": 0.1235, "step": 1851 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.18237695842981339, "epoch": 2.967948717948718, "grad_norm": 0.011762432754039764, "learning_rate": 1e-06, "loss": 0.1953, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 2733.078125, "completions/mean_terminated_length": 1255.70556640625, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "entropy": 0.2183959111571312, "epoch": 2.969551282051282, "frac_reward_zero_std": 0.125, "grad_norm": 1045.5770263671875, "learning_rate": 1e-06, "loss": 0.0877, "num_tokens": 1139114341.0, "reward": 0.33328425884246826, "reward_std": 0.06576712429523468, "rewards/progression_diversity/mean": -0.030461102724075317, "rewards/progression_diversity/std": 0.09234879910945892, "rewards/symbolic_reward_accuracy/mean": 0.203125, "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, "rewards/symbolic_reward_partial_score/mean": 0.7057129144668579, "rewards/symbolic_reward_partial_score/std": 0.23553626239299774, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9779849052429199, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 39.03363800048828, "step": 1853 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2225719690322876, "epoch": 2.9711538461538463, "grad_norm": 0.00991111621260643, "learning_rate": 1e-06, "loss": 0.1348, "step": 1854 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.19753701984882355, "epoch": 2.97275641025641, "grad_norm": 0.006589308846741915, "learning_rate": 1e-06, "loss": 0.145, "step": 1855 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2158377543091774, "epoch": 2.9743589743589745, "grad_norm": 0.009189927950501442, "learning_rate": 1e-06, "loss": 0.1266, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2904.0, "completions/mean_length": 2554.814453125, "completions/mean_terminated_length": 1254.6346435546875, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.20936352014541626, "epoch": 2.9759615384615383, "frac_reward_zero_std": 0.1875, "grad_norm": 463.4955139160156, "learning_rate": 1e-06, "loss": 0.0787, "num_tokens": 1141373350.0, "reward": 0.3864889144897461, "reward_std": 0.0557912215590477, "rewards/progression_diversity/mean": -0.02347256988286972, "rewards/progression_diversity/std": 0.07694897800683975, "rewards/symbolic_reward_accuracy/mean": 0.2734375, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.7422037720680237, "rewards/symbolic_reward_partial_score/std": 0.24117407202720642, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9877347946166992, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 33.34986114501953, "step": 1857 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.22580835223197937, "epoch": 2.9775641025641026, "grad_norm": 0.01121189258992672, "learning_rate": 1e-06, "loss": 0.0885, "step": 1858 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2194606065750122, "epoch": 2.9791666666666665, "grad_norm": 0.018390899524092674, "learning_rate": 1e-06, "loss": 0.1005, "step": 1859 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.20080887526273727, "epoch": 2.980769230769231, "grad_norm": 0.009253687225282192, "learning_rate": 1e-06, "loss": 0.1674, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 2508.169921875, "completions/mean_terminated_length": 1268.2021484375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.23013577610254288, "epoch": 2.9823717948717947, "frac_reward_zero_std": 0.1875, "grad_norm": 397.2406311035156, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 1143492301.0, "reward": 0.39299777150154114, "reward_std": 0.05584446340799332, "rewards/progression_diversity/mean": -0.023467358201742172, "rewards/progression_diversity/std": 0.07824753224849701, "rewards/symbolic_reward_accuracy/mean": 0.291015625, "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, "rewards/symbolic_reward_partial_score/mean": 0.7287434935569763, "rewards/symbolic_reward_partial_score/std": 0.24250133335590363, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919546842575073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 31.701099395751953, "step": 1861 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2217669039964676, "epoch": 2.983974358974359, "grad_norm": 53.626285552978516, "learning_rate": 1e-06, "loss": 0.0267, "step": 1862 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.21721980720758438, "epoch": 2.9855769230769234, "grad_norm": 0.032945115119218826, "learning_rate": 1e-06, "loss": 0.172, "step": 1863 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.22124967724084854, "epoch": 2.9871794871794872, "grad_norm": 0.01755567640066147, "learning_rate": 1e-06, "loss": 0.093, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2574.0, "completions/mean_length": 2252.9453125, "completions/mean_terminated_length": 1279.4071044921875, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "entropy": 0.2342042252421379, "epoch": 2.988782051282051, "frac_reward_zero_std": 0.28125, "grad_norm": 481.52252197265625, "learning_rate": 1e-06, "loss": 0.1885, "num_tokens": 1145451937.0, "reward": 0.3523566424846649, "reward_std": 0.04059898108243942, "rewards/progression_diversity/mean": -0.01775522157549858, "rewards/progression_diversity/std": 0.06716568768024445, "rewards/symbolic_reward_accuracy/mean": 0.212890625, "rewards/symbolic_reward_accuracy/std": 0.409751296043396, "rewards/symbolic_reward_partial_score/mean": 0.7493326663970947, "rewards/symbolic_reward_partial_score/std": 0.22399716079235077, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0028358697891235, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 27.260162353515625, "step": 1865 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.24164949357509613, "epoch": 2.9903846153846154, "grad_norm": 14.552428245544434, "learning_rate": 1e-06, "loss": 0.0096, "step": 1866 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23062748461961746, "epoch": 2.9919871794871797, "grad_norm": 0.030370866879820824, "learning_rate": 1e-06, "loss": 0.1004, "step": 1867 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.24224235862493515, "epoch": 2.9935897435897436, "grad_norm": 0.011348499916493893, "learning_rate": 1e-06, "loss": 0.1191, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 2649.326171875, "completions/mean_terminated_length": 1261.09033203125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.24135691672563553, "epoch": 2.9951923076923075, "frac_reward_zero_std": 0.09375, "grad_norm": 1024.800537109375, "learning_rate": 1e-06, "loss": 0.0707, "num_tokens": 1147683176.0, "reward": 0.3426709771156311, "reward_std": 0.04791571944952011, "rewards/progression_diversity/mean": -0.025381293147802353, "rewards/progression_diversity/std": 0.07976257801055908, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.7212076187133789, "rewards/symbolic_reward_partial_score/std": 0.21549275517463684, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9791022539138794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 38.175758361816406, "step": 1869 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2204393520951271, "epoch": 2.996794871794872, "grad_norm": 1905.4779052734375, "learning_rate": 1e-06, "loss": 0.2076, "step": 1870 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.21573010832071304, "epoch": 2.998397435897436, "grad_norm": 12149.6005859375, "learning_rate": 1e-06, "loss": 1.1185, "step": 1871 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.2214333862066269, "epoch": 3.0, "grad_norm": 0.018644830211997032, "learning_rate": 1e-06, "loss": 0.1475, "step": 1872 }, { "epoch": 3.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.080078125, "eval_completions/max_length": 16384.0, "eval_completions/max_terminated_length": 2229.46875, "eval_completions/mean_length": 2536.551025390625, "eval_completions/mean_terminated_length": 1331.3045349121094, "eval_completions/min_length": 594.8125, "eval_completions/min_terminated_length": 594.8125, "eval_entropy": 0.20870172325521708, "eval_frac_reward_zero_std": 0.1796875, "eval_loss": 0.04187625274062157, "eval_num_tokens": 1147683176.0, "eval_reward": 0.2534480579197407, "eval_reward_std": 0.0388854734483175, "eval_rewards/progression_diversity/mean": -0.021589503419818357, "eval_rewards/progression_diversity/std": 0.07232485536951572, "eval_rewards/symbolic_reward_accuracy/mean": 0.09130859375, "eval_rewards/symbolic_reward_accuracy/std": 0.2087636678479612, "eval_rewards/symbolic_reward_partial_score/mean": 0.6634989399462938, "eval_rewards/symbolic_reward_partial_score/std": 0.1970880120061338, "eval_rewards/tag_count_reward/mean": -0.001708984375, "eval_rewards/tag_count_reward/std": 0.015794883016496897, "eval_runtime": 4301.5149, "eval_samples_per_second": 0.058, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.021749384701252, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 768.0, "eval_sampling/sampling_logp_difference/mean": 19.073409140110016, "eval_steps_per_second": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 2402.98046875, "completions/mean_terminated_length": 1313.9326171875, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "entropy": 0.23216699063777924, "epoch": 3.0016025641025643, "frac_reward_zero_std": 0.21875, "grad_norm": 1125.79931640625, "learning_rate": 1e-06, "loss": 0.0865, "num_tokens": 1149811486.0, "reward": 0.3975224494934082, "reward_std": 0.04312722384929657, "rewards/progression_diversity/mean": -0.02021792158484459, "rewards/progression_diversity/std": 0.07212448120117188, "rewards/symbolic_reward_accuracy/mean": 0.3046875, "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, "rewards/symbolic_reward_partial_score/mean": 0.7163736820220947, "rewards/symbolic_reward_partial_score/std": 0.23637627065181732, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9978500604629517, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 30.130069732666016, "step": 1873 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.23934829235076904, "epoch": 3.003205128205128, "grad_norm": 0.007034912705421448, "learning_rate": 1e-06, "loss": 0.1083, "step": 1874 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.22745058685541153, "epoch": 3.0048076923076925, "grad_norm": 0.00796093326061964, "learning_rate": 1e-06, "loss": 0.0727, "step": 1875 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2292134240269661, "epoch": 3.0064102564102564, "grad_norm": 0.019052157178521156, "learning_rate": 1e-06, "loss": 0.1367, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 1881.35546875, "completions/mean_terminated_length": 1322.429931640625, "completions/min_length": 434.0, "completions/min_terminated_length": 434.0, "entropy": 0.2549893334507942, "epoch": 3.0080128205128207, "frac_reward_zero_std": 0.3125, "grad_norm": 0.024593627080321312, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 1151633332.0, "reward": 0.2892531454563141, "reward_std": 0.039773762226104736, "rewards/progression_diversity/mean": -0.010232724249362946, "rewards/progression_diversity/std": 0.055212657898664474, "rewards/symbolic_reward_accuracy/mean": 0.119140625, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.7275390625, "rewards/symbolic_reward_partial_score/std": 0.19641563296318054, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0221221446990967, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 19.10765838623047, "step": 1877 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.24741330742835999, "epoch": 3.0096153846153846, "grad_norm": 0.01947159692645073, "learning_rate": 1e-06, "loss": 0.1312, "step": 1878 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24799786508083344, "epoch": 3.011217948717949, "grad_norm": 0.02338356338441372, "learning_rate": 1e-06, "loss": 0.0404, "step": 1879 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2448197305202484, "epoch": 3.0128205128205128, "grad_norm": 0.014209832064807415, "learning_rate": 1e-06, "loss": 0.0894, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3167.0, "completions/mean_length": 1858.625, "completions/mean_terminated_length": 1359.7738037109375, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "entropy": 0.25408633053302765, "epoch": 3.014423076923077, "frac_reward_zero_std": 0.375, "grad_norm": 454.2471618652344, "learning_rate": 1e-06, "loss": 0.0447, "num_tokens": 1153465092.0, "reward": 0.3250887095928192, "reward_std": 0.015932850539684296, "rewards/progression_diversity/mean": -0.011149164289236069, "rewards/progression_diversity/std": 0.05865437537431717, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.7096517086029053, "rewards/symbolic_reward_partial_score/std": 0.19842353463172913, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.028539776802063, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 17.165746688842773, "step": 1881 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2505646124482155, "epoch": 3.016025641025641, "grad_norm": 0.6798326373100281, "learning_rate": 1e-06, "loss": 0.0431, "step": 1882 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.24388013035058975, "epoch": 3.0176282051282053, "grad_norm": 1.9514085054397583, "learning_rate": 1e-06, "loss": 0.1241, "step": 1883 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.26686759293079376, "epoch": 3.019230769230769, "grad_norm": 0.012397652491927147, "learning_rate": 1e-06, "loss": -0.0004, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 1794.703125, "completions/mean_terminated_length": 1414.6212158203125, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.2557114362716675, "epoch": 3.0208333333333335, "frac_reward_zero_std": 0.53125, "grad_norm": 544.6851196289062, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 1155145164.0, "reward": 0.37460383772850037, "reward_std": 0.016295883804559708, "rewards/progression_diversity/mean": -0.008367680944502354, "rewards/progression_diversity/std": 0.05215869098901749, "rewards/symbolic_reward_accuracy/mean": 0.25, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.7502604126930237, "rewards/symbolic_reward_partial_score/std": 0.1986854523420334, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.030568242073059, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 16.76722526550293, "step": 1885 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.26103954017162323, "epoch": 3.0224358974358974, "grad_norm": 91.75881958007812, "learning_rate": 1e-06, "loss": 0.2128, "step": 1886 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2607679069042206, "epoch": 3.0240384615384617, "grad_norm": 0.012660062871873379, "learning_rate": 1e-06, "loss": 0.0615, "step": 1887 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.26138727366924286, "epoch": 3.0256410256410255, "grad_norm": 0.01078207790851593, "learning_rate": 1e-06, "loss": 0.0361, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1951.89453125, "completions/mean_terminated_length": 1395.6876220703125, "completions/min_length": 593.0, "completions/min_terminated_length": 593.0, "entropy": 0.2467678338289261, "epoch": 3.02724358974359, "frac_reward_zero_std": 0.25, "grad_norm": 572.4014282226562, "learning_rate": 1e-06, "loss": 0.0566, "num_tokens": 1157118710.0, "reward": 0.226592555642128, "reward_std": 0.021917399019002914, "rewards/progression_diversity/mean": -0.01262015849351883, "rewards/progression_diversity/std": 0.06260491162538528, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.6385416984558105, "rewards/symbolic_reward_partial_score/std": 0.20352140069007874, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0226848125457764, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 19.33092498779297, "step": 1889 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.23328816145658493, "epoch": 3.0288461538461537, "grad_norm": 20633.263671875, "learning_rate": 1e-06, "loss": 4.1374, "step": 1890 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2414126992225647, "epoch": 3.030448717948718, "grad_norm": 6596.80908203125, "learning_rate": 1e-06, "loss": 0.5508, "step": 1891 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2591097354888916, "epoch": 3.032051282051282, "grad_norm": 0.01172653678804636, "learning_rate": 1e-06, "loss": 0.0182, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 1600.69921875, "completions/mean_terminated_length": 1366.043701171875, "completions/min_length": 407.0, "completions/min_terminated_length": 407.0, "entropy": 0.26757121086120605, "epoch": 3.0336538461538463, "frac_reward_zero_std": 0.375, "grad_norm": 0.026514563709497452, "learning_rate": 1e-06, "loss": 0.0088, "num_tokens": 1158840796.0, "reward": 0.22522126138210297, "reward_std": 0.04094214737415314, "rewards/progression_diversity/mean": -0.005220310762524605, "rewards/progression_diversity/std": 0.04069770127534866, "rewards/symbolic_reward_accuracy/mean": 0.0390625, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.6740885972976685, "rewards/symbolic_reward_partial_score/std": 0.1974918395280838, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041705846786499, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 10.610610008239746, "step": 1893 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25377361476421356, "epoch": 3.03525641025641, "grad_norm": 10585.2099609375, "learning_rate": 1e-06, "loss": 0.5272, "step": 1894 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.26954758167266846, "epoch": 3.0368589743589745, "grad_norm": 0.022427378222346306, "learning_rate": 1e-06, "loss": -0.0047, "step": 1895 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2479778230190277, "epoch": 3.0384615384615383, "grad_norm": 0.013517378829419613, "learning_rate": 1e-06, "loss": 0.2005, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1637.05859375, "completions/mean_terminated_length": 1373.19677734375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "entropy": 0.2555150091648102, "epoch": 3.0400641025641026, "frac_reward_zero_std": 0.5, "grad_norm": 30.618288040161133, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 1160526426.0, "reward": 0.4024108052253723, "reward_std": 0.02461879327893257, "rewards/progression_diversity/mean": -0.005499871913343668, "rewards/progression_diversity/std": 0.04200981557369232, "rewards/symbolic_reward_accuracy/mean": 0.275390625, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.790771484375, "rewards/symbolic_reward_partial_score/std": 0.18546874821186066, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0372226238250732, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 12.745841979980469, "step": 1897 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2539917603135109, "epoch": 3.0416666666666665, "grad_norm": 44032.421875, "learning_rate": 1e-06, "loss": 1.0033, "step": 1898 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.25875651836395264, "epoch": 3.043269230769231, "grad_norm": 0.02200913429260254, "learning_rate": 1e-06, "loss": 0.0217, "step": 1899 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.25009360164403915, "epoch": 3.0448717948717947, "grad_norm": 0.015072772279381752, "learning_rate": 1e-06, "loss": 0.0296, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 1893.123046875, "completions/mean_terminated_length": 1395.4566650390625, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "entropy": 0.25338225811719894, "epoch": 3.046474358974359, "frac_reward_zero_std": 0.34375, "grad_norm": 0.014189253561198711, "learning_rate": 1e-06, "loss": 0.0664, "num_tokens": 1162334969.0, "reward": 0.31671467423439026, "reward_std": 0.037882737815380096, "rewards/progression_diversity/mean": -0.010173956863582134, "rewards/progression_diversity/std": 0.055442675948143005, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.7090494632720947, "rewards/symbolic_reward_partial_score/std": 0.20027847588062286, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.020569920539856, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 20.987850189208984, "step": 1901 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.25274112075567245, "epoch": 3.048076923076923, "grad_norm": 0.008120290003716946, "learning_rate": 1e-06, "loss": 0.0418, "step": 1902 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.24684013426303864, "epoch": 3.0496794871794872, "grad_norm": 0.010809239000082016, "learning_rate": 1e-06, "loss": 0.0503, "step": 1903 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.25020767748355865, "epoch": 3.051282051282051, "grad_norm": 0.0168289951980114, "learning_rate": 1e-06, "loss": 0.0331, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2925.0, "completions/mean_length": 1549.30078125, "completions/mean_terminated_length": 1343.67138671875, "completions/min_length": 443.0, "completions/min_terminated_length": 443.0, "entropy": 0.2655075490474701, "epoch": 3.0528846153846154, "frac_reward_zero_std": 0.375, "grad_norm": 0.02596384286880493, "learning_rate": 1e-06, "loss": 0.0209, "num_tokens": 1164024995.0, "reward": 0.3695905804634094, "reward_std": 0.033132217824459076, "rewards/progression_diversity/mean": -0.004322149325162172, "rewards/progression_diversity/std": 0.03666413947939873, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.7438313961029053, "rewards/symbolic_reward_partial_score/std": 0.20166073739528656, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0442430973052979, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 9.987215995788574, "step": 1905 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.25963981449604034, "epoch": 3.0544871794871793, "grad_norm": 140.0711669921875, "learning_rate": 1e-06, "loss": 0.0012, "step": 1906 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.26139749586582184, "epoch": 3.0560897435897436, "grad_norm": 0.00938948430120945, "learning_rate": 1e-06, "loss": 0.0255, "step": 1907 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2722902148962021, "epoch": 3.0576923076923075, "grad_norm": 0.011267850175499916, "learning_rate": 1e-06, "loss": 0.0042, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1744.671875, "completions/mean_terminated_length": 1333.1243896484375, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "entropy": 0.2618495374917984, "epoch": 3.059294871794872, "frac_reward_zero_std": 0.34375, "grad_norm": 652.4515380859375, "learning_rate": 1e-06, "loss": 0.0214, "num_tokens": 1165798987.0, "reward": 0.4160562753677368, "reward_std": 0.061300039291381836, "rewards/progression_diversity/mean": -0.009119954891502857, "rewards/progression_diversity/std": 0.05495789274573326, "rewards/symbolic_reward_accuracy/mean": 0.314453125, "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, "rewards/symbolic_reward_partial_score/mean": 0.7589030265808105, "rewards/symbolic_reward_partial_score/std": 0.2069811373949051, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0315742492675781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 16.712371826171875, "step": 1909 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2622825354337692, "epoch": 3.0608974358974357, "grad_norm": 0.021915055811405182, "learning_rate": 1e-06, "loss": -0.0031, "step": 1910 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2576129883527756, "epoch": 3.0625, "grad_norm": 0.013906807638704777, "learning_rate": 1e-06, "loss": 0.0443, "step": 1911 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2665242552757263, "epoch": 3.064102564102564, "grad_norm": 0.012783014215528965, "learning_rate": 1e-06, "loss": 0.0385, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1683.671875, "completions/mean_terminated_length": 1360.91015625, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "entropy": 0.25943586230278015, "epoch": 3.065705128205128, "frac_reward_zero_std": 0.25, "grad_norm": 292.14520263671875, "learning_rate": 1e-06, "loss": 0.0416, "num_tokens": 1167423891.0, "reward": 0.37675100564956665, "reward_std": 0.06550440192222595, "rewards/progression_diversity/mean": -0.005566404201090336, "rewards/progression_diversity/std": 0.03809665888547897, "rewards/symbolic_reward_accuracy/mean": 0.2578125, "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, "rewards/symbolic_reward_partial_score/mean": 0.74169921875, "rewards/symbolic_reward_partial_score/std": 0.21923932433128357, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.039749026298523, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 12.124536514282227, "step": 1913 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.27193763852119446, "epoch": 3.0673076923076925, "grad_norm": 0.013166146352887154, "learning_rate": 1e-06, "loss": 0.0071, "step": 1914 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2520969808101654, "epoch": 3.0689102564102564, "grad_norm": 3.3681511878967285, "learning_rate": 1e-06, "loss": 0.057, "step": 1915 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.26810455322265625, "epoch": 3.0705128205128207, "grad_norm": 0.6854636073112488, "learning_rate": 1e-06, "loss": 0.0013, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 1847.66796875, "completions/mean_terminated_length": 1348.4404296875, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "entropy": 0.261865496635437, "epoch": 3.0721153846153846, "frac_reward_zero_std": 0.25, "grad_norm": 45.9178581237793, "learning_rate": 1e-06, "loss": -0.0052, "num_tokens": 1169237785.0, "reward": 0.32650455832481384, "reward_std": 0.09211001545190811, "rewards/progression_diversity/mean": -0.009701091796159744, "rewards/progression_diversity/std": 0.052368562668561935, "rewards/symbolic_reward_accuracy/mean": 0.169921875, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.750781238079071, "rewards/symbolic_reward_partial_score/std": 0.19588179886341095, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0248017311096191, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 18.72113800048828, "step": 1917 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.24243663251399994, "epoch": 3.073717948717949, "grad_norm": 1099.10400390625, "learning_rate": 1e-06, "loss": 0.1447, "step": 1918 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2528642266988754, "epoch": 3.0753205128205128, "grad_norm": 0.01181944552809, "learning_rate": 1e-06, "loss": 0.0237, "step": 1919 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.25511983036994934, "epoch": 3.076923076923077, "grad_norm": 0.016715671867132187, "learning_rate": 1e-06, "loss": 0.0151, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1621.16796875, "completions/mean_terminated_length": 1327.087646484375, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "entropy": 0.2685047686100006, "epoch": 3.078525641025641, "frac_reward_zero_std": 0.28125, "grad_norm": 0.027836158871650696, "learning_rate": 1e-06, "loss": 0.0567, "num_tokens": 1170851103.0, "reward": 0.36298680305480957, "reward_std": 0.06454600393772125, "rewards/progression_diversity/mean": -0.006008030381053686, "rewards/progression_diversity/std": 0.04314760863780975, "rewards/symbolic_reward_accuracy/mean": 0.228515625, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.7537760734558105, "rewards/symbolic_reward_partial_score/std": 0.1955997198820114, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0417566299438477, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 11.107623100280762, "step": 1921 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.25865650177001953, "epoch": 3.0801282051282053, "grad_norm": 3558.986328125, "learning_rate": 1e-06, "loss": 0.0839, "step": 1922 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2664439529180527, "epoch": 3.081730769230769, "grad_norm": 0.014974008314311504, "learning_rate": 1e-06, "loss": -0.005, "step": 1923 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.25956542789936066, "epoch": 3.0833333333333335, "grad_norm": 0.009708881378173828, "learning_rate": 1e-06, "loss": 0.0435, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 2362.03515625, "completions/mean_terminated_length": 1301.550537109375, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "entropy": 0.24602339416742325, "epoch": 3.0849358974358974, "frac_reward_zero_std": 0.34375, "grad_norm": 400.1549072265625, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 1172988833.0, "reward": 0.3911093473434448, "reward_std": 0.03172388672828674, "rewards/progression_diversity/mean": -0.020903022959828377, "rewards/progression_diversity/std": 0.0763976201415062, "rewards/symbolic_reward_accuracy/mean": 0.2734375, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.759472668170929, "rewards/symbolic_reward_partial_score/std": 0.20224013924598694, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9978427886962891, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 31.93221664428711, "step": 1925 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.25485508143901825, "epoch": 3.0865384615384617, "grad_norm": 0.007848401553928852, "learning_rate": 1e-06, "loss": 0.0098, "step": 1926 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.23554416000843048, "epoch": 3.0881410256410255, "grad_norm": 0.011878496035933495, "learning_rate": 1e-06, "loss": 0.092, "step": 1927 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.21972551941871643, "epoch": 3.08974358974359, "grad_norm": 0.0162891186773777, "learning_rate": 1e-06, "loss": 0.1825, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 3023.3359375, "completions/mean_terminated_length": 1349.58251953125, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "entropy": 0.220417819917202, "epoch": 3.0913461538461537, "frac_reward_zero_std": 0.15625, "grad_norm": 1063.9591064453125, "learning_rate": 1e-06, "loss": 0.1881, "num_tokens": 1175419421.0, "reward": 0.32698261737823486, "reward_std": 0.024275176227092743, "rewards/progression_diversity/mean": -0.031721051782369614, "rewards/progression_diversity/std": 0.09057775884866714, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.7316243648529053, "rewards/symbolic_reward_partial_score/std": 0.19131198525428772, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9758140444755554, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 42.80847930908203, "step": 1929 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2313474789261818, "epoch": 3.092948717948718, "grad_norm": 0.009462164714932442, "learning_rate": 1e-06, "loss": 0.0303, "step": 1930 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.23852744698524475, "epoch": 3.094551282051282, "grad_norm": 0.00927420798689127, "learning_rate": 1e-06, "loss": 0.0868, "step": 1931 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.21710015833377838, "epoch": 3.0961538461538463, "grad_norm": 0.015988515689969063, "learning_rate": 1e-06, "loss": 0.1895, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 2461.416015625, "completions/mean_terminated_length": 1376.919921875, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "entropy": 0.24317217618227005, "epoch": 3.09775641025641, "frac_reward_zero_std": 0.3125, "grad_norm": 1258.6036376953125, "learning_rate": 1e-06, "loss": 0.09, "num_tokens": 1177535618.0, "reward": 0.2823217809200287, "reward_std": 0.015544566325843334, "rewards/progression_diversity/mean": -0.020752854645252228, "rewards/progression_diversity/std": 0.07482607662677765, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.6937174797058105, "rewards/symbolic_reward_partial_score/std": 0.17785848677158356, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0054988861083984, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 28.267414093017578, "step": 1933 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.24698983132839203, "epoch": 3.0993589743589745, "grad_norm": 2.280089855194092, "learning_rate": 1e-06, "loss": 0.063, "step": 1934 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.24258919060230255, "epoch": 3.1009615384615383, "grad_norm": 0.01370900496840477, "learning_rate": 1e-06, "loss": 0.0806, "step": 1935 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.24202165007591248, "epoch": 3.1025641025641026, "grad_norm": 0.010354790836572647, "learning_rate": 1e-06, "loss": 0.0415, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 2724.0078125, "completions/mean_terminated_length": 1343.3204345703125, "completions/min_length": 516.0, "completions/min_terminated_length": 516.0, "entropy": 0.23228947818279266, "epoch": 3.1041666666666665, "frac_reward_zero_std": 0.28125, "grad_norm": 134.23583984375, "learning_rate": 1e-06, "loss": 0.0476, "num_tokens": 1179819430.0, "reward": 0.35509487986564636, "reward_std": 0.032482314854860306, "rewards/progression_diversity/mean": -0.02615930885076523, "rewards/progression_diversity/std": 0.08310631662607193, "rewards/symbolic_reward_accuracy/mean": 0.21484375, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.7567870616912842, "rewards/symbolic_reward_partial_score/std": 0.19756251573562622, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9856817722320557, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 37.22113800048828, "step": 1937 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.21816729754209518, "epoch": 3.105769230769231, "grad_norm": 22.10169792175293, "learning_rate": 1e-06, "loss": 0.1096, "step": 1938 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.23560451716184616, "epoch": 3.1073717948717947, "grad_norm": 861.4119873046875, "learning_rate": 1e-06, "loss": 0.1194, "step": 1939 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.237302266061306, "epoch": 3.108974358974359, "grad_norm": 0.013657595030963421, "learning_rate": 1e-06, "loss": 0.0515, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 2930.619140625, "completions/mean_terminated_length": 1377.1785888671875, "completions/min_length": 395.0, "completions/min_terminated_length": 395.0, "entropy": 0.23268328607082367, "epoch": 3.110576923076923, "frac_reward_zero_std": 0.25, "grad_norm": 118.8416976928711, "learning_rate": 1e-06, "loss": 0.0817, "num_tokens": 1182230323.0, "reward": 0.37685173749923706, "reward_std": 0.045152001082897186, "rewards/progression_diversity/mean": -0.029186122119426727, "rewards/progression_diversity/std": 0.08660390228033066, "rewards/symbolic_reward_accuracy/mean": 0.2578125, "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, "rewards/symbolic_reward_partial_score/mean": 0.7434733510017395, "rewards/symbolic_reward_partial_score/std": 0.20847611129283905, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9919402599334717, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 33.77556610107422, "step": 1941 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2476828247308731, "epoch": 3.1121794871794872, "grad_norm": 2530.47900390625, "learning_rate": 1e-06, "loss": 0.222, "step": 1942 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22099629044532776, "epoch": 3.113782051282051, "grad_norm": 0.0288047194480896, "learning_rate": 1e-06, "loss": 0.1299, "step": 1943 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2267403081059456, "epoch": 3.1153846153846154, "grad_norm": 0.01607387140393257, "learning_rate": 1e-06, "loss": 0.1327, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 2772.017578125, "completions/mean_terminated_length": 1396.182861328125, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "entropy": 0.2250806912779808, "epoch": 3.1169871794871793, "frac_reward_zero_std": 0.28125, "grad_norm": 1792.7398681640625, "learning_rate": 1e-06, "loss": 0.1175, "num_tokens": 1184533004.0, "reward": 0.28835493326187134, "reward_std": 0.013276607729494572, "rewards/progression_diversity/mean": -0.02485804632306099, "rewards/progression_diversity/std": 0.07890177518129349, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.7126628160476685, "rewards/symbolic_reward_partial_score/std": 0.17205090820789337, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.004730463027954, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 27.43810272216797, "step": 1945 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2443278580904007, "epoch": 3.1185897435897436, "grad_norm": 4188.24609375, "learning_rate": 1e-06, "loss": 0.5792, "step": 1946 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2495875358581543, "epoch": 3.1201923076923075, "grad_norm": 0.09753794223070145, "learning_rate": 1e-06, "loss": 0.0398, "step": 1947 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24468916654586792, "epoch": 3.121794871794872, "grad_norm": 0.011647003702819347, "learning_rate": 1e-06, "loss": 0.0852, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2344.0, "completions/mean_length": 2688.154296875, "completions/mean_terminated_length": 1432.45849609375, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "entropy": 0.2420196533203125, "epoch": 3.123397435897436, "frac_reward_zero_std": 0.28125, "grad_norm": 359.50616455078125, "learning_rate": 1e-06, "loss": 0.1009, "num_tokens": 1186778155.0, "reward": 0.32234811782836914, "reward_std": 0.037715356796979904, "rewards/progression_diversity/mean": -0.022023096680641174, "rewards/progression_diversity/std": 0.07346688210964203, "rewards/symbolic_reward_accuracy/mean": 0.177734375, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.7197591066360474, "rewards/symbolic_reward_partial_score/std": 0.20028026401996613, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001284122467041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 28.891010284423828, "step": 1949 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.24103453755378723, "epoch": 3.125, "grad_norm": 589.58740234375, "learning_rate": 1e-06, "loss": 2.001, "step": 1950 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.24162401258945465, "epoch": 3.126602564102564, "grad_norm": 5572.74560546875, "learning_rate": 1e-06, "loss": 0.4189, "step": 1951 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.234800323843956, "epoch": 3.128205128205128, "grad_norm": 475.8064270019531, "learning_rate": 1e-06, "loss": 0.2053, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 3146.59375, "completions/mean_terminated_length": 1455.4713134765625, "completions/min_length": 452.0, "completions/min_terminated_length": 452.0, "entropy": 0.2195385992527008, "epoch": 3.1298076923076925, "frac_reward_zero_std": 0.15625, "grad_norm": 503.7859802246094, "learning_rate": 1e-06, "loss": 0.1427, "num_tokens": 1189316267.0, "reward": 0.269339382648468, "reward_std": 0.020436681807041168, "rewards/progression_diversity/mean": -0.027488097548484802, "rewards/progression_diversity/std": 0.0782787874341011, "rewards/symbolic_reward_accuracy/mean": 0.09765625, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.7034016847610474, "rewards/symbolic_reward_partial_score/std": 0.17726561427116394, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9874823093414307, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 35.55792999267578, "step": 1953 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.25392740964889526, "epoch": 3.1314102564102564, "grad_norm": 6332.8427734375, "learning_rate": 1e-06, "loss": 0.1834, "step": 1954 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2183244675397873, "epoch": 3.1330128205128207, "grad_norm": 0.011082753539085388, "learning_rate": 1e-06, "loss": 0.1257, "step": 1955 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.21788176894187927, "epoch": 3.1346153846153846, "grad_norm": 0.012296471744775772, "learning_rate": 1e-06, "loss": 0.1528, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.103515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 3008.599609375, "completions/mean_terminated_length": 1464.163330078125, "completions/min_length": 584.0, "completions/min_terminated_length": 584.0, "entropy": 0.2399488091468811, "epoch": 3.136217948717949, "frac_reward_zero_std": 0.15625, "grad_norm": 634.225341796875, "learning_rate": 1e-06, "loss": 0.1405, "num_tokens": 1191638702.0, "reward": 0.31707698106765747, "reward_std": 0.049435004591941833, "rewards/progression_diversity/mean": -0.02521352283656597, "rewards/progression_diversity/std": 0.07498207688331604, "rewards/symbolic_reward_accuracy/mean": 0.169921875, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.7185709476470947, "rewards/symbolic_reward_partial_score/std": 0.21633154153823853, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.985991895198822, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 37.30690002441406, "step": 1957 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2299083024263382, "epoch": 3.1378205128205128, "grad_norm": 0.010621101595461369, "learning_rate": 1e-06, "loss": 0.1298, "step": 1958 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2408609539270401, "epoch": 3.139423076923077, "grad_norm": 0.011294880881905556, "learning_rate": 1e-06, "loss": 0.0936, "step": 1959 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.23295968770980835, "epoch": 3.141025641025641, "grad_norm": 0.014650301076471806, "learning_rate": 1e-06, "loss": 0.0687, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 3111.46875, "completions/mean_terminated_length": 1415.8590087890625, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.24924977123737335, "epoch": 3.1426282051282053, "frac_reward_zero_std": 0.1875, "grad_norm": 237.02276611328125, "learning_rate": 1e-06, "loss": 0.0949, "num_tokens": 1194090830.0, "reward": 0.3610386848449707, "reward_std": 0.037800274789333344, "rewards/progression_diversity/mean": -0.028455514460802078, "rewards/progression_diversity/std": 0.08039110153913498, "rewards/symbolic_reward_accuracy/mean": 0.224609375, "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, "rewards/symbolic_reward_partial_score/mean": 0.7551920413970947, "rewards/symbolic_reward_partial_score/std": 0.19215147197246552, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9815695285797119, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 40.95595169067383, "step": 1961 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.22469578683376312, "epoch": 3.144230769230769, "grad_norm": 0.009706591255962849, "learning_rate": 1e-06, "loss": 0.1529, "step": 1962 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.22220056504011154, "epoch": 3.1458333333333335, "grad_norm": 0.02369273081421852, "learning_rate": 1e-06, "loss": 0.1648, "step": 1963 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.25326360762119293, "epoch": 3.1474358974358974, "grad_norm": 0.015049627050757408, "learning_rate": 1e-06, "loss": 0.051, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 2517.6328125, "completions/mean_terminated_length": 1405.9830322265625, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "entropy": 0.25177982449531555, "epoch": 3.1490384615384617, "frac_reward_zero_std": 0.3125, "grad_norm": 872.0093994140625, "learning_rate": 1e-06, "loss": 0.0878, "num_tokens": 1196269138.0, "reward": 0.3650954067707062, "reward_std": 0.032753195613622665, "rewards/progression_diversity/mean": -0.019758375361561775, "rewards/progression_diversity/std": 0.07037756592035294, "rewards/symbolic_reward_accuracy/mean": 0.23828125, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.7410807013511658, "rewards/symbolic_reward_partial_score/std": 0.20055758953094482, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.004058599472046, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 30.5396671295166, "step": 1965 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.25476063042879105, "epoch": 3.1506410256410255, "grad_norm": 0.01801050268113613, "learning_rate": 1e-06, "loss": 0.0639, "step": 1966 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.24143607169389725, "epoch": 3.15224358974359, "grad_norm": 0.009572253562510014, "learning_rate": 1e-06, "loss": 0.114, "step": 1967 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2614782154560089, "epoch": 3.1538461538461537, "grad_norm": 0.02647644467651844, "learning_rate": 1e-06, "loss": 0.04, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 2340.412109375, "completions/mean_terminated_length": 1497.21533203125, "completions/min_length": 473.0, "completions/min_terminated_length": 473.0, "entropy": 0.2505124807357788, "epoch": 3.155448717948718, "frac_reward_zero_std": 0.25, "grad_norm": 1409.9781494140625, "learning_rate": 1e-06, "loss": 0.1449, "num_tokens": 1198283685.0, "reward": 0.4389854073524475, "reward_std": 0.043696288019418716, "rewards/progression_diversity/mean": -0.015524028800427914, "rewards/progression_diversity/std": 0.06471197307109833, "rewards/symbolic_reward_accuracy/mean": 0.34765625, "rewards/symbolic_reward_accuracy/std": 0.47669193148612976, "rewards/symbolic_reward_partial_score/mean": 0.7691406011581421, "rewards/symbolic_reward_partial_score/std": 0.21009880304336548, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0053313970565796, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 29.34543800354004, "step": 1969 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2559570074081421, "epoch": 3.157051282051282, "grad_norm": 1054.045654296875, "learning_rate": 1e-06, "loss": 0.082, "step": 1970 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2517922967672348, "epoch": 3.1586538461538463, "grad_norm": 10.967491149902344, "learning_rate": 1e-06, "loss": -0.0051, "step": 1971 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24725372344255447, "epoch": 3.16025641025641, "grad_norm": 0.010240145027637482, "learning_rate": 1e-06, "loss": 0.0999, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 2950.55859375, "completions/mean_terminated_length": 1496.722900390625, "completions/min_length": 423.0, "completions/min_terminated_length": 423.0, "entropy": 0.25035959482192993, "epoch": 3.1618589743589745, "frac_reward_zero_std": 0.25, "grad_norm": 633.6778564453125, "learning_rate": 1e-06, "loss": 0.0946, "num_tokens": 1200602867.0, "reward": 0.33387240767478943, "reward_std": 0.026298727840185165, "rewards/progression_diversity/mean": -0.02584683895111084, "rewards/progression_diversity/std": 0.07925648987293243, "rewards/symbolic_reward_accuracy/mean": 0.197265625, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.71923828125, "rewards/symbolic_reward_partial_score/std": 0.1927872896194458, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9945404529571533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 35.76648712158203, "step": 1973 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.25055961310863495, "epoch": 3.1634615384615383, "grad_norm": 278.57867431640625, "learning_rate": 1e-06, "loss": 0.0723, "step": 1974 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23513774573802948, "epoch": 3.1650641025641026, "grad_norm": 0.15121746063232422, "learning_rate": 1e-06, "loss": 0.1125, "step": 1975 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2441992089152336, "epoch": 3.1666666666666665, "grad_norm": 0.013731342740356922, "learning_rate": 1e-06, "loss": 0.0602, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 3148.57421875, "completions/mean_terminated_length": 1457.704833984375, "completions/min_length": 573.0, "completions/min_terminated_length": 573.0, "entropy": 0.21663058549165726, "epoch": 3.168269230769231, "frac_reward_zero_std": 0.09375, "grad_norm": 515.6699829101562, "learning_rate": 1e-06, "loss": 0.0905, "num_tokens": 1203158377.0, "reward": 0.2782261371612549, "reward_std": 0.03023504465818405, "rewards/progression_diversity/mean": -0.029924802482128143, "rewards/progression_diversity/std": 0.08487491309642792, "rewards/symbolic_reward_accuracy/mean": 0.119140625, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.6914387941360474, "rewards/symbolic_reward_partial_score/std": 0.19170799851417542, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9659879207611084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 49.51683807373047, "step": 1977 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2332163155078888, "epoch": 3.1698717948717947, "grad_norm": 2.1581897735595703, "learning_rate": 1e-06, "loss": 0.0941, "step": 1978 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.21122775226831436, "epoch": 3.171474358974359, "grad_norm": 0.049657199531793594, "learning_rate": 1e-06, "loss": 0.1684, "step": 1979 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24628569930791855, "epoch": 3.173076923076923, "grad_norm": 0.019608579576015472, "learning_rate": 1e-06, "loss": 0.0439, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3310.0, "completions/mean_length": 2705.47265625, "completions/mean_terminated_length": 1451.3646240234375, "completions/min_length": 542.0, "completions/min_terminated_length": 542.0, "entropy": 0.26659801602363586, "epoch": 3.1746794871794872, "frac_reward_zero_std": 0.21875, "grad_norm": 256.61602783203125, "learning_rate": 1e-06, "loss": 0.0419, "num_tokens": 1205377627.0, "reward": 0.40071532130241394, "reward_std": 0.029286310076713562, "rewards/progression_diversity/mean": -0.022705569863319397, "rewards/progression_diversity/std": 0.07598486542701721, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.7830891609191895, "rewards/symbolic_reward_partial_score/std": 0.1975134015083313, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9810075759887695, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 43.57196044921875, "step": 1981 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23883583396673203, "epoch": 3.176282051282051, "grad_norm": 2.147189140319824, "learning_rate": 1e-06, "loss": 0.0706, "step": 1982 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.26977184414863586, "epoch": 3.1778846153846154, "grad_norm": 78.77035522460938, "learning_rate": 1e-06, "loss": 0.0521, "step": 1983 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23553596436977386, "epoch": 3.1794871794871793, "grad_norm": 0.018339024856686592, "learning_rate": 1e-06, "loss": 0.5282, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 2496.416015625, "completions/mean_terminated_length": 1477.410888671875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "entropy": 0.2516746520996094, "epoch": 3.1810897435897436, "frac_reward_zero_std": 0.25, "grad_norm": 319.26788330078125, "learning_rate": 1e-06, "loss": 0.0477, "num_tokens": 1207524576.0, "reward": 0.3050941824913025, "reward_std": 0.039040304720401764, "rewards/progression_diversity/mean": -0.017927538603544235, "rewards/progression_diversity/std": 0.06751146912574768, "rewards/symbolic_reward_accuracy/mean": 0.146484375, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.7259114980697632, "rewards/symbolic_reward_partial_score/std": 0.20375892519950867, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0033478736877441, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 31.887889862060547, "step": 1985 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.26808346807956696, "epoch": 3.1826923076923075, "grad_norm": 0.020371215417981148, "learning_rate": 1e-06, "loss": 0.0406, "step": 1986 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.24439021944999695, "epoch": 3.184294871794872, "grad_norm": 0.026235945522785187, "learning_rate": 1e-06, "loss": 0.0765, "step": 1987 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25858327746391296, "epoch": 3.185897435897436, "grad_norm": 0.016966918483376503, "learning_rate": 1e-06, "loss": 0.0628, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 1891.138671875, "completions/mean_terminated_length": 1363.0587158203125, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "entropy": 0.28316083550453186, "epoch": 3.1875, "frac_reward_zero_std": 0.3125, "grad_norm": 3698.61865234375, "learning_rate": 1e-06, "loss": 0.0706, "num_tokens": 1209406615.0, "reward": 0.3008219003677368, "reward_std": 0.029717685654759407, "rewards/progression_diversity/mean": -0.010097447782754898, "rewards/progression_diversity/std": 0.053449101746082306, "rewards/symbolic_reward_accuracy/mean": 0.142578125, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.7185709476470947, "rewards/symbolic_reward_partial_score/std": 0.17875374853610992, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0198476314544678, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 25.146913528442383, "step": 1989 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2714526355266571, "epoch": 3.189102564102564, "grad_norm": 7592.54052734375, "learning_rate": 1e-06, "loss": 0.8777, "step": 1990 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.28693751990795135, "epoch": 3.190705128205128, "grad_norm": 20502.578125, "learning_rate": 1e-06, "loss": 2.4146, "step": 1991 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.283732146024704, "epoch": 3.1923076923076925, "grad_norm": 0.011515190824866295, "learning_rate": 1e-06, "loss": 0.043, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2856.0, "completions/mean_length": 1891.53125, "completions/mean_terminated_length": 1363.465576171875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.2893388271331787, "epoch": 3.1939102564102564, "frac_reward_zero_std": 0.375, "grad_norm": 136.03277587890625, "learning_rate": 1e-06, "loss": 0.0309, "num_tokens": 1211197511.0, "reward": 0.38298678398132324, "reward_std": 0.05362379923462868, "rewards/progression_diversity/mean": -0.010407653637230396, "rewards/progression_diversity/std": 0.05494730547070503, "rewards/symbolic_reward_accuracy/mean": 0.26171875, "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, "rewards/symbolic_reward_partial_score/mean": 0.754833996295929, "rewards/symbolic_reward_partial_score/std": 0.21281343698501587, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0291106700897217, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 21.492839813232422, "step": 1993 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2943323999643326, "epoch": 3.1955128205128207, "grad_norm": 0.021322406828403473, "learning_rate": 1e-06, "loss": 0.033, "step": 1994 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.29063861072063446, "epoch": 3.1971153846153846, "grad_norm": 0.012699613347649574, "learning_rate": 1e-06, "loss": 0.0278, "step": 1995 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.28081028163433075, "epoch": 3.198717948717949, "grad_norm": 0.00812804140150547, "learning_rate": 1e-06, "loss": 0.0823, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 1646.9921875, "completions/mean_terminated_length": 1353.4263916015625, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "entropy": 0.30514436960220337, "epoch": 3.2003205128205128, "frac_reward_zero_std": 0.4375, "grad_norm": 60.97207260131836, "learning_rate": 1e-06, "loss": -0.0098, "num_tokens": 1212900051.0, "reward": 0.32839280366897583, "reward_std": 0.04186505824327469, "rewards/progression_diversity/mean": -0.005448305979371071, "rewards/progression_diversity/std": 0.039716459810733795, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.7497720718383789, "rewards/symbolic_reward_partial_score/std": 0.1919063776731491, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0453128814697266, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 13.722127914428711, "step": 1997 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.29429739713668823, "epoch": 3.201923076923077, "grad_norm": 349.4608459472656, "learning_rate": 1e-06, "loss": 0.0811, "step": 1998 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.30618786811828613, "epoch": 3.203525641025641, "grad_norm": 0.018446296453475952, "learning_rate": 1e-06, "loss": 0.0303, "step": 1999 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.29249128699302673, "epoch": 3.2051282051282053, "grad_norm": 0.015572361648082733, "learning_rate": 1e-06, "loss": 0.0433, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 1567.892578125, "completions/mean_terminated_length": 1362.5208740234375, "completions/min_length": 600.0, "completions/min_terminated_length": 600.0, "entropy": 0.2950544059276581, "epoch": 3.206730769230769, "frac_reward_zero_std": 0.34375, "grad_norm": 891.14892578125, "learning_rate": 1e-06, "loss": 0.0646, "num_tokens": 1214607692.0, "reward": 0.26205992698669434, "reward_std": 0.03215021640062332, "rewards/progression_diversity/mean": -0.003969744313508272, "rewards/progression_diversity/std": 0.03686157613992691, "rewards/symbolic_reward_accuracy/mean": 0.087890625, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.6991862058639526, "rewards/symbolic_reward_partial_score/std": 0.1865617036819458, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0514659881591797, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 10.364349365234375, "step": 2001 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.29572050273418427, "epoch": 3.2083333333333335, "grad_norm": 0.008014540188014507, "learning_rate": 1e-06, "loss": 0.0259, "step": 2002 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2993382215499878, "epoch": 3.2099358974358974, "grad_norm": 0.021720819175243378, "learning_rate": 1e-06, "loss": -0.0056, "step": 2003 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3016369044780731, "epoch": 3.2115384615384617, "grad_norm": 0.0189630426466465, "learning_rate": 1e-06, "loss": 0.0159, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3574.0, "completions/mean_length": 1687.447265625, "completions/mean_terminated_length": 1364.7684326171875, "completions/min_length": 483.0, "completions/min_terminated_length": 483.0, "entropy": 0.29340440034866333, "epoch": 3.2131410256410255, "frac_reward_zero_std": 0.5, "grad_norm": 509.89044189453125, "learning_rate": 1e-06, "loss": 0.0367, "num_tokens": 1216400945.0, "reward": 0.3805909752845764, "reward_std": 0.03878272697329521, "rewards/progression_diversity/mean": -0.007796908728778362, "rewards/progression_diversity/std": 0.05300579220056534, "rewards/symbolic_reward_accuracy/mean": 0.259765625, "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, "rewards/symbolic_reward_partial_score/mean": 0.7513183355331421, "rewards/symbolic_reward_partial_score/std": 0.19548480212688446, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0402953624725342, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 16.1550235748291, "step": 2005 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.30069899559020996, "epoch": 3.21474358974359, "grad_norm": 0.02268948033452034, "learning_rate": 1e-06, "loss": 0.0185, "step": 2006 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3007173389196396, "epoch": 3.2163461538461537, "grad_norm": 0.008483972400426865, "learning_rate": 1e-06, "loss": 0.0269, "step": 2007 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2845690995454788, "epoch": 3.217948717948718, "grad_norm": 0.010253841057419777, "learning_rate": 1e-06, "loss": 0.0552, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 1595.154296875, "completions/mean_terminated_length": 1360.4107666015625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "entropy": 0.28091390430927277, "epoch": 3.219551282051282, "frac_reward_zero_std": 0.5625, "grad_norm": 675.5444946289062, "learning_rate": 1e-06, "loss": 0.0337, "num_tokens": 1218156384.0, "reward": 0.2970985472202301, "reward_std": 0.03322647139430046, "rewards/progression_diversity/mean": -0.005477225407958031, "rewards/progression_diversity/std": 0.044522009789943695, "rewards/symbolic_reward_accuracy/mean": 0.142578125, "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, "rewards/symbolic_reward_partial_score/mean": 0.7066569328308105, "rewards/symbolic_reward_partial_score/std": 0.1901051253080368, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0481798648834229, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 11.399556159973145, "step": 2009 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29631251096725464, "epoch": 3.2211538461538463, "grad_norm": 0.01047491654753685, "learning_rate": 1e-06, "loss": 0.004, "step": 2010 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2931921035051346, "epoch": 3.22275641025641, "grad_norm": 0.03937433660030365, "learning_rate": 1e-06, "loss": 0.0033, "step": 2011 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.29075874388217926, "epoch": 3.2243589743589745, "grad_norm": 0.010890385136008263, "learning_rate": 1e-06, "loss": 0.017, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 1558.37890625, "completions/mean_terminated_length": 1382.5810546875, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.282899871468544, "epoch": 3.2259615384615383, "frac_reward_zero_std": 0.5, "grad_norm": 980.3523559570312, "learning_rate": 1e-06, "loss": 0.0471, "num_tokens": 1219817762.0, "reward": 0.3504934310913086, "reward_std": 0.030621502548456192, "rewards/progression_diversity/mean": -0.004368623252958059, "rewards/progression_diversity/std": 0.040239714086055756, "rewards/symbolic_reward_accuracy/mean": 0.228515625, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.71142578125, "rewards/symbolic_reward_partial_score/std": 0.2032359093427658, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0505871772766113, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 9.663378715515137, "step": 2013 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2933056503534317, "epoch": 3.2275641025641026, "grad_norm": 0.016422249376773834, "learning_rate": 1e-06, "loss": 0.0012, "step": 2014 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2866353541612625, "epoch": 3.2291666666666665, "grad_norm": 0.020436976104974747, "learning_rate": 1e-06, "loss": 0.0241, "step": 2015 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.28925251960754395, "epoch": 3.230769230769231, "grad_norm": 0.018621278926730156, "learning_rate": 1e-06, "loss": 0.0175, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 1511.5234375, "completions/mean_terminated_length": 1335.1700439453125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "entropy": 0.29188382625579834, "epoch": 3.2323717948717947, "frac_reward_zero_std": 0.375, "grad_norm": 0.03130757808685303, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 1221518830.0, "reward": 0.3500271439552307, "reward_std": 0.07550652325153351, "rewards/progression_diversity/mean": -0.0036349566653370857, "rewards/progression_diversity/std": 0.03353985399007797, "rewards/symbolic_reward_accuracy/mean": 0.23828125, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.6909668445587158, "rewards/symbolic_reward_partial_score/std": 0.22590909898281097, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0503188371658325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 8.717741012573242, "step": 2017 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2773764133453369, "epoch": 3.233974358974359, "grad_norm": 0.017145292833447456, "learning_rate": 1e-06, "loss": 0.0126, "step": 2018 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.27354469895362854, "epoch": 3.235576923076923, "grad_norm": 0.011831255629658699, "learning_rate": 1e-06, "loss": 0.0443, "step": 2019 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2817593514919281, "epoch": 3.2371794871794872, "grad_norm": 0.016237668693065643, "learning_rate": 1e-06, "loss": 0.0098, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1795.888671875, "completions/mean_terminated_length": 1325.304443359375, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "entropy": 0.2711557298898697, "epoch": 3.238782051282051, "frac_reward_zero_std": 0.40625, "grad_norm": 163.37515258789062, "learning_rate": 1e-06, "loss": 0.0598, "num_tokens": 1223317701.0, "reward": 0.3071795105934143, "reward_std": 0.022627130150794983, "rewards/progression_diversity/mean": -0.010563986375927925, "rewards/progression_diversity/std": 0.05871390923857689, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.6570963263511658, "rewards/symbolic_reward_partial_score/std": 0.21646232903003693, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0290942192077637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 19.123592376708984, "step": 2021 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2706785798072815, "epoch": 3.2403846153846154, "grad_norm": 0.007887561805546284, "learning_rate": 1e-06, "loss": 0.0574, "step": 2022 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2705296725034714, "epoch": 3.2419871794871793, "grad_norm": 0.016599016264081, "learning_rate": 1e-06, "loss": 0.0139, "step": 2023 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.26619046926498413, "epoch": 3.2435897435897436, "grad_norm": 0.014114703983068466, "learning_rate": 1e-06, "loss": 0.049, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1582.685546875, "completions/mean_terminated_length": 1317.850830078125, "completions/min_length": 603.0, "completions/min_terminated_length": 603.0, "entropy": 0.27589017152786255, "epoch": 3.2451923076923075, "frac_reward_zero_std": 0.5, "grad_norm": 714.7753295898438, "learning_rate": 1e-06, "loss": 0.0405, "num_tokens": 1224991732.0, "reward": 0.32780733704566956, "reward_std": 0.02535293623805046, "rewards/progression_diversity/mean": -0.006377549842000008, "rewards/progression_diversity/std": 0.04781962186098099, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.7276692390441895, "rewards/symbolic_reward_partial_score/std": 0.2073507308959961, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.040276288986206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 13.916572570800781, "step": 2025 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.278276264667511, "epoch": 3.246794871794872, "grad_norm": 641.1627807617188, "learning_rate": 1e-06, "loss": 0.081, "step": 2026 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2818632125854492, "epoch": 3.248397435897436, "grad_norm": 0.015987196937203407, "learning_rate": 1e-06, "loss": 0.0139, "step": 2027 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2770925313234329, "epoch": 3.25, "grad_norm": 0.011531771160662174, "learning_rate": 1e-06, "loss": -0.0061, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 1594.681640625, "completions/mean_terminated_length": 1330.0615234375, "completions/min_length": 639.0, "completions/min_terminated_length": 639.0, "entropy": 0.28490838408470154, "epoch": 3.251602564102564, "frac_reward_zero_std": 0.53125, "grad_norm": 630.0040283203125, "learning_rate": 1e-06, "loss": 0.0141, "num_tokens": 1226658273.0, "reward": 0.3503781259059906, "reward_std": 0.02471214532852173, "rewards/progression_diversity/mean": -0.0061342837288975716, "rewards/progression_diversity/std": 0.0460776686668396, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.7469075322151184, "rewards/symbolic_reward_partial_score/std": 0.19170016050338745, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.040297508239746, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 14.458892822265625, "step": 2029 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.28325770795345306, "epoch": 3.253205128205128, "grad_norm": 0.02777559868991375, "learning_rate": 1e-06, "loss": 0.0386, "step": 2030 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2924428731203079, "epoch": 3.2548076923076925, "grad_norm": 0.005939020775258541, "learning_rate": 1e-06, "loss": -0.0084, "step": 2031 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.26891082525253296, "epoch": 3.2564102564102564, "grad_norm": 0.010626004077494144, "learning_rate": 1e-06, "loss": 0.1295, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1786.677734375, "completions/mean_terminated_length": 1315.79638671875, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "entropy": 0.29806482791900635, "epoch": 3.2580128205128207, "frac_reward_zero_std": 0.40625, "grad_norm": 0.012230968102812767, "learning_rate": 1e-06, "loss": -0.0145, "num_tokens": 1228425196.0, "reward": 0.32576221227645874, "reward_std": 0.05162687227129936, "rewards/progression_diversity/mean": -0.010695156641304493, "rewards/progression_diversity/std": 0.05995798856019974, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.717089831829071, "rewards/symbolic_reward_partial_score/std": 0.22520095109939575, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0235651731491089, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 23.860057830810547, "step": 2033 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2714930325746536, "epoch": 3.2596153846153846, "grad_norm": 0.009989427402615547, "learning_rate": 1e-06, "loss": 0.145, "step": 2034 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2667497843503952, "epoch": 3.261217948717949, "grad_norm": 0.01684493198990822, "learning_rate": 1e-06, "loss": 0.1274, "step": 2035 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2843262106180191, "epoch": 3.2628205128205128, "grad_norm": 0.014857680536806583, "learning_rate": 1e-06, "loss": 0.0398, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3595.0, "completions/mean_length": 1610.28125, "completions/mean_terminated_length": 1345.9403076171875, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "entropy": 0.2858640253543854, "epoch": 3.264423076923077, "frac_reward_zero_std": 0.53125, "grad_norm": 336.9473571777344, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 1230085228.0, "reward": 0.3623395264148712, "reward_std": 0.04071733355522156, "rewards/progression_diversity/mean": -0.005795993376523256, "rewards/progression_diversity/std": 0.04330339655280113, "rewards/symbolic_reward_accuracy/mean": 0.228515625, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.7535644173622131, "rewards/symbolic_reward_partial_score/std": 0.21194274723529816, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045485258102417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 12.358026504516602, "step": 2037 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2867206782102585, "epoch": 3.266025641025641, "grad_norm": 0.012359599582850933, "learning_rate": 1e-06, "loss": 0.0326, "step": 2038 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.279480904340744, "epoch": 3.2676282051282053, "grad_norm": 0.009706364013254642, "learning_rate": 1e-06, "loss": 0.0751, "step": 2039 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2926556468009949, "epoch": 3.269230769230769, "grad_norm": 0.013265096582472324, "learning_rate": 1e-06, "loss": 0.0111, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3382.0, "completions/mean_length": 1684.5390625, "completions/mean_terminated_length": 1361.79638671875, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "entropy": 0.27776144444942474, "epoch": 3.2708333333333335, "frac_reward_zero_std": 0.46875, "grad_norm": 754.3441162109375, "learning_rate": 1e-06, "loss": 0.0869, "num_tokens": 1231778640.0, "reward": 0.43440812826156616, "reward_std": 0.034479107707738876, "rewards/progression_diversity/mean": -0.00694271782413125, "rewards/progression_diversity/std": 0.04713256284594536, "rewards/symbolic_reward_accuracy/mean": 0.326171875, "rewards/symbolic_reward_accuracy/std": 0.4692695140838623, "rewards/symbolic_reward_partial_score/mean": 0.7978678941726685, "rewards/symbolic_reward_partial_score/std": 0.1925002634525299, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.036149501800537, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 17.656431198120117, "step": 2041 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29258349537849426, "epoch": 3.2724358974358974, "grad_norm": 0.03969258442521095, "learning_rate": 1e-06, "loss": -0.0049, "step": 2042 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2861692011356354, "epoch": 3.2740384615384617, "grad_norm": 0.007797840982675552, "learning_rate": 1e-06, "loss": 0.0038, "step": 2043 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2861800342798233, "epoch": 3.2756410256410255, "grad_norm": 0.024233318865299225, "learning_rate": 1e-06, "loss": 0.0363, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 1453.421875, "completions/mean_terminated_length": 1335.8582763671875, "completions/min_length": 547.0, "completions/min_terminated_length": 547.0, "entropy": 0.29494452476501465, "epoch": 3.27724358974359, "frac_reward_zero_std": 0.40625, "grad_norm": 475.1607971191406, "learning_rate": 1e-06, "loss": 0.0204, "num_tokens": 1233400632.0, "reward": 0.3379926085472107, "reward_std": 0.04110347479581833, "rewards/progression_diversity/mean": -0.00249581434763968, "rewards/progression_diversity/std": 0.027295051142573357, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.7068033814430237, "rewards/symbolic_reward_partial_score/std": 0.20671771466732025, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.05837082862854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 6.645764350891113, "step": 2045 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.29623962938785553, "epoch": 3.2788461538461537, "grad_norm": 0.015234522521495819, "learning_rate": 1e-06, "loss": 0.0231, "step": 2046 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2966182827949524, "epoch": 3.280448717948718, "grad_norm": 0.028103666380047798, "learning_rate": 1e-06, "loss": 0.0202, "step": 2047 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.29752831161022186, "epoch": 3.282051282051282, "grad_norm": 0.016002511605620384, "learning_rate": 1e-06, "loss": 0.0205, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 1659.55859375, "completions/mean_terminated_length": 1396.099365234375, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "entropy": 0.3017704486846924, "epoch": 3.2836538461538463, "frac_reward_zero_std": 0.59375, "grad_norm": 0.022180970758199692, "learning_rate": 1e-06, "loss": 0.0183, "num_tokens": 1235048998.0, "reward": 0.34624069929122925, "reward_std": 0.03489365801215172, "rewards/progression_diversity/mean": -0.005324858706444502, "rewards/progression_diversity/std": 0.0404798686504364, "rewards/symbolic_reward_accuracy/mean": 0.20703125, "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, "rewards/symbolic_reward_partial_score/mean": 0.7448079586029053, "rewards/symbolic_reward_partial_score/std": 0.2022588700056076, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0514497756958008, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 10.927322387695312, "step": 2049 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3079407513141632, "epoch": 3.28525641025641, "grad_norm": 1578.2557373046875, "learning_rate": 1e-06, "loss": 0.112, "step": 2050 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.29569441080093384, "epoch": 3.2868589743589745, "grad_norm": 0.012577169574797153, "learning_rate": 1e-06, "loss": 0.0704, "step": 2051 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2986031025648117, "epoch": 3.2884615384615383, "grad_norm": 0.0136114452034235, "learning_rate": 1e-06, "loss": 0.0306, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1793.57421875, "completions/mean_terminated_length": 1443.404052734375, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "entropy": 0.306133508682251, "epoch": 3.2900641025641026, "frac_reward_zero_std": 0.3125, "grad_norm": 395.223876953125, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 1236814236.0, "reward": 0.4016110897064209, "reward_std": 0.042516425251960754, "rewards/progression_diversity/mean": -0.007838626392185688, "rewards/progression_diversity/std": 0.05088994279503822, "rewards/symbolic_reward_accuracy/mean": 0.267578125, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.8090169429779053, "rewards/symbolic_reward_partial_score/std": 0.18975462019443512, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0399129390716553, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 16.826345443725586, "step": 2053 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.304185152053833, "epoch": 3.2916666666666665, "grad_norm": 0.020139280706644058, "learning_rate": 1e-06, "loss": 0.0203, "step": 2054 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.28031550347805023, "epoch": 3.293269230769231, "grad_norm": 0.008938499726355076, "learning_rate": 1e-06, "loss": 0.2255, "step": 2055 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.29509237408638, "epoch": 3.2948717948717947, "grad_norm": 0.008660328574478626, "learning_rate": 1e-06, "loss": 0.0369, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 1667.703125, "completions/mean_terminated_length": 1434.1112060546875, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "entropy": 0.2912152111530304, "epoch": 3.296474358974359, "frac_reward_zero_std": 0.375, "grad_norm": 549.008056640625, "learning_rate": 1e-06, "loss": 0.0571, "num_tokens": 1238580100.0, "reward": 0.28469109535217285, "reward_std": 0.039379484951496124, "rewards/progression_diversity/mean": -0.004526130389422178, "rewards/progression_diversity/std": 0.03515040501952171, "rewards/symbolic_reward_accuracy/mean": 0.119140625, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.7153971195220947, "rewards/symbolic_reward_partial_score/std": 0.1853335201740265, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0521401166915894, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 10.635208129882812, "step": 2057 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2999632656574249, "epoch": 3.298076923076923, "grad_norm": 0.015014038421213627, "learning_rate": 1e-06, "loss": 0.0142, "step": 2058 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2982807457447052, "epoch": 3.2996794871794872, "grad_norm": 0.021116536110639572, "learning_rate": 1e-06, "loss": 0.0415, "step": 2059 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.30165646970272064, "epoch": 3.301282051282051, "grad_norm": 0.02505728229880333, "learning_rate": 1e-06, "loss": 0.0484, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 1967.10546875, "completions/mean_terminated_length": 1441.7935791015625, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "entropy": 0.28198811411857605, "epoch": 3.3028846153846154, "frac_reward_zero_std": 0.3125, "grad_norm": 2657.8173828125, "learning_rate": 1e-06, "loss": 0.1443, "num_tokens": 1240427130.0, "reward": 0.38248294591903687, "reward_std": 0.058625657111406326, "rewards/progression_diversity/mean": -0.010982503183186054, "rewards/progression_diversity/std": 0.05810944363474846, "rewards/symbolic_reward_accuracy/mean": 0.271484375, "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, "rewards/symbolic_reward_partial_score/mean": 0.7408040761947632, "rewards/symbolic_reward_partial_score/std": 0.22931715846061707, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0257844924926758, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 24.58890151977539, "step": 2061 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.305512472987175, "epoch": 3.3044871794871793, "grad_norm": 0.010081891901791096, "learning_rate": 1e-06, "loss": 0.0085, "step": 2062 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.30105258524417877, "epoch": 3.3060897435897436, "grad_norm": 0.019923444837331772, "learning_rate": 1e-06, "loss": 0.0702, "step": 2063 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2942555248737335, "epoch": 3.3076923076923075, "grad_norm": 0.01970398612320423, "learning_rate": 1e-06, "loss": 0.0693, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 1714.302734375, "completions/mean_terminated_length": 1481.450439453125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "entropy": 0.3109530508518219, "epoch": 3.309294871794872, "frac_reward_zero_std": 0.375, "grad_norm": 0.02412906475365162, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 1242207125.0, "reward": 0.36652231216430664, "reward_std": 0.056298792362213135, "rewards/progression_diversity/mean": -0.005487251095473766, "rewards/progression_diversity/std": 0.04296133294701576, "rewards/symbolic_reward_accuracy/mean": 0.25, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.7264810800552368, "rewards/symbolic_reward_partial_score/std": 0.2101718783378601, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0529332160949707, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 11.437311172485352, "step": 2065 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2926006019115448, "epoch": 3.310897435897436, "grad_norm": 0.01898755133152008, "learning_rate": 1e-06, "loss": 0.0776, "step": 2066 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3063723295927048, "epoch": 3.3125, "grad_norm": 0.014987506903707981, "learning_rate": 1e-06, "loss": 0.0091, "step": 2067 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.29933975636959076, "epoch": 3.314102564102564, "grad_norm": 0.020037300884723663, "learning_rate": 1e-06, "loss": 0.0595, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 1686.85546875, "completions/mean_terminated_length": 1453.5675048828125, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "entropy": 0.3088431656360626, "epoch": 3.315705128205128, "frac_reward_zero_std": 0.5, "grad_norm": 310.1370544433594, "learning_rate": 1e-06, "loss": 0.0196, "num_tokens": 1243928939.0, "reward": 0.328504741191864, "reward_std": 0.027106130495667458, "rewards/progression_diversity/mean": -0.005487233866006136, "rewards/progression_diversity/std": 0.04299917817115784, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.7319173216819763, "rewards/symbolic_reward_partial_score/std": 0.20580171048641205, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0549488067626953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 10.485675811767578, "step": 2069 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3059859573841095, "epoch": 3.3173076923076925, "grad_norm": 178.32826232910156, "learning_rate": 1e-06, "loss": 0.0184, "step": 2070 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.29551123082637787, "epoch": 3.3189102564102564, "grad_norm": 0.01883271336555481, "learning_rate": 1e-06, "loss": 0.0948, "step": 2071 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.30632631480693817, "epoch": 3.3205128205128207, "grad_norm": 0.015535010024905205, "learning_rate": 1e-06, "loss": 0.005, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 1778.39453125, "completions/mean_terminated_length": 1487.4462890625, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "entropy": 0.30395227670669556, "epoch": 3.3221153846153846, "frac_reward_zero_std": 0.40625, "grad_norm": 893.9067993164062, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 1245691061.0, "reward": 0.36694616079330444, "reward_std": 0.03838976100087166, "rewards/progression_diversity/mean": -0.006556159816682339, "rewards/progression_diversity/std": 0.04625479876995087, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.7370442152023315, "rewards/symbolic_reward_partial_score/std": 0.21306481957435608, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0472426414489746, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 14.168490409851074, "step": 2073 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2978048026561737, "epoch": 3.323717948717949, "grad_norm": 10739.20703125, "learning_rate": 1e-06, "loss": 0.3728, "step": 2074 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30055759847164154, "epoch": 3.3253205128205128, "grad_norm": 0.01299914252012968, "learning_rate": 1e-06, "loss": 0.0633, "step": 2075 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.30361442267894745, "epoch": 3.326923076923077, "grad_norm": 0.014193286187946796, "learning_rate": 1e-06, "loss": 0.0325, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3344.0, "completions/mean_length": 1754.5703125, "completions/mean_terminated_length": 1463.1474609375, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "entropy": 0.2972850501537323, "epoch": 3.328525641025641, "frac_reward_zero_std": 0.5625, "grad_norm": 378.9573974609375, "learning_rate": 1e-06, "loss": 0.0103, "num_tokens": 1247440089.0, "reward": 0.43345290422439575, "reward_std": 0.04101278632879257, "rewards/progression_diversity/mean": -0.005784530192613602, "rewards/progression_diversity/std": 0.04120200499892235, "rewards/symbolic_reward_accuracy/mean": 0.3359375, "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, "rewards/symbolic_reward_partial_score/mean": 0.778369128704071, "rewards/symbolic_reward_partial_score/std": 0.21870289742946625, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049149513244629, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 12.310261726379395, "step": 2077 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.29869405925273895, "epoch": 3.3301282051282053, "grad_norm": 0.006345598492771387, "learning_rate": 1e-06, "loss": 0.0603, "step": 2078 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2994239032268524, "epoch": 3.331730769230769, "grad_norm": 0.013875186443328857, "learning_rate": 1e-06, "loss": 0.0434, "step": 2079 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2941579520702362, "epoch": 3.3333333333333335, "grad_norm": 0.03364482894539833, "learning_rate": 1e-06, "loss": 0.0486, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 1733.458984375, "completions/mean_terminated_length": 1471.322021484375, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "entropy": 0.29007087647914886, "epoch": 3.3349358974358974, "frac_reward_zero_std": 0.34375, "grad_norm": 648.8250732421875, "learning_rate": 1e-06, "loss": 0.0486, "num_tokens": 1249261396.0, "reward": 0.32781529426574707, "reward_std": 0.039317332208156586, "rewards/progression_diversity/mean": -0.006068596616387367, "rewards/progression_diversity/std": 0.0454968586564064, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.7361490726470947, "rewards/symbolic_reward_partial_score/std": 0.19703876972198486, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.049651026725769, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 12.473758697509766, "step": 2081 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.29181039333343506, "epoch": 3.3365384615384617, "grad_norm": 1459.4117431640625, "learning_rate": 1e-06, "loss": 0.1145, "step": 2082 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.301235556602478, "epoch": 3.3381410256410255, "grad_norm": 0.025303874164819717, "learning_rate": 1e-06, "loss": -0.003, "step": 2083 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.29415562748908997, "epoch": 3.33974358974359, "grad_norm": 0.023084204643964767, "learning_rate": 1e-06, "loss": -0.0009, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 1614.404296875, "completions/mean_terminated_length": 1439.270751953125, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "entropy": 0.29135817289352417, "epoch": 3.3413461538461537, "frac_reward_zero_std": 0.59375, "grad_norm": 2127.327392578125, "learning_rate": 1e-06, "loss": 0.0704, "num_tokens": 1250959283.0, "reward": 0.36038675904273987, "reward_std": 0.021374184638261795, "rewards/progression_diversity/mean": -0.0038059065118432045, "rewards/progression_diversity/std": 0.03501441702246666, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.7085774540901184, "rewards/symbolic_reward_partial_score/std": 0.2277929186820984, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0553886890411377, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 9.732671737670898, "step": 2085 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.30615630745887756, "epoch": 3.342948717948718, "grad_norm": 0.018460825085639954, "learning_rate": 1e-06, "loss": 0.0185, "step": 2086 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30728329718112946, "epoch": 3.344551282051282, "grad_norm": 0.009494633413851261, "learning_rate": 1e-06, "loss": 0.0159, "step": 2087 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.31128981709480286, "epoch": 3.3461538461538463, "grad_norm": 0.007844570092856884, "learning_rate": 1e-06, "loss": -0.0016, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 1687.4375, "completions/mean_terminated_length": 1454.1588134765625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "entropy": 0.309854656457901, "epoch": 3.34775641025641, "frac_reward_zero_std": 0.46875, "grad_norm": 0.018283460289239883, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 1252780163.0, "reward": 0.3683873414993286, "reward_std": 0.03664931654930115, "rewards/progression_diversity/mean": -0.0050177741795778275, "rewards/progression_diversity/std": 0.03996478021144867, "rewards/symbolic_reward_accuracy/mean": 0.25, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.7300781011581421, "rewards/symbolic_reward_partial_score/std": 0.2158641666173935, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0487573146820068, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 13.234875679016113, "step": 2089 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2977482080459595, "epoch": 3.3493589743589745, "grad_norm": 3524.53466796875, "learning_rate": 1e-06, "loss": 0.2433, "step": 2090 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.30203960835933685, "epoch": 3.3509615384615383, "grad_norm": 0.010414248332381248, "learning_rate": 1e-06, "loss": 0.0241, "step": 2091 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30492982268333435, "epoch": 3.3525641025641026, "grad_norm": 0.02597237005829811, "learning_rate": 1e-06, "loss": 0.0162, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 1726.21484375, "completions/mean_terminated_length": 1463.9482421875, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "entropy": 0.30012136697769165, "epoch": 3.3541666666666665, "frac_reward_zero_std": 0.4375, "grad_norm": 42.711727142333984, "learning_rate": 1e-06, "loss": 0.0384, "num_tokens": 1254551649.0, "reward": 0.33932000398635864, "reward_std": 0.04096568375825882, "rewards/progression_diversity/mean": -0.005503435619175434, "rewards/progression_diversity/std": 0.041275136172771454, "rewards/symbolic_reward_accuracy/mean": 0.203125, "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, "rewards/symbolic_reward_partial_score/mean": 0.7276041507720947, "rewards/symbolic_reward_partial_score/std": 0.1983337253332138, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0483951568603516, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 13.683696746826172, "step": 2093 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3052857369184494, "epoch": 3.355769230769231, "grad_norm": 0.014036121778190136, "learning_rate": 1e-06, "loss": 0.0409, "step": 2094 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.304688423871994, "epoch": 3.3573717948717947, "grad_norm": 0.19088442623615265, "learning_rate": 1e-06, "loss": 0.0153, "step": 2095 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3058953136205673, "epoch": 3.358974358974359, "grad_norm": 0.011440307833254337, "learning_rate": 1e-06, "loss": 0.0268, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 1805.677734375, "completions/mean_terminated_length": 1515.27294921875, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "entropy": 0.3070167005062103, "epoch": 3.360576923076923, "frac_reward_zero_std": 0.65625, "grad_norm": 1267.1373291015625, "learning_rate": 1e-06, "loss": 0.0541, "num_tokens": 1256251692.0, "reward": 0.40323367714881897, "reward_std": 0.047273900359869, "rewards/progression_diversity/mean": -0.0057373507879674435, "rewards/progression_diversity/std": 0.041116341948509216, "rewards/symbolic_reward_accuracy/mean": 0.283203125, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.7844075560569763, "rewards/symbolic_reward_partial_score/std": 0.19797967374324799, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0512890815734863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 12.443960189819336, "step": 2097 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3034556806087494, "epoch": 3.3621794871794872, "grad_norm": 0.012835121713578701, "learning_rate": 1e-06, "loss": 0.0352, "step": 2098 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.3106335401535034, "epoch": 3.363782051282051, "grad_norm": 0.007598002441227436, "learning_rate": 1e-06, "loss": 0.0361, "step": 2099 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.30908720195293427, "epoch": 3.3653846153846154, "grad_norm": 0.006704273633658886, "learning_rate": 1e-06, "loss": 0.0375, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 1834.38671875, "completions/mean_terminated_length": 1485.196044921875, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "entropy": 0.30240941047668457, "epoch": 3.3669871794871793, "frac_reward_zero_std": 0.5, "grad_norm": 343.7887268066406, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 1258104818.0, "reward": 0.35596776008605957, "reward_std": 0.044219110161066055, "rewards/progression_diversity/mean": -0.008694911375641823, "rewards/progression_diversity/std": 0.0567040778696537, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.775390625, "rewards/symbolic_reward_partial_score/std": 0.20388461649417877, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0403454303741455, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 17.884170532226562, "step": 2101 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.30909933149814606, "epoch": 3.3685897435897436, "grad_norm": 0.010506085120141506, "learning_rate": 1e-06, "loss": 0.0457, "step": 2102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.29592080414295197, "epoch": 3.3701923076923075, "grad_norm": 0.15130743384361267, "learning_rate": 1e-06, "loss": 0.0234, "step": 2103 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.29659251868724823, "epoch": 3.371794871794872, "grad_norm": 0.009021518751978874, "learning_rate": 1e-06, "loss": 0.0793, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 1713.470703125, "completions/mean_terminated_length": 1510.1168212890625, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "entropy": 0.3099695146083832, "epoch": 3.373397435897436, "frac_reward_zero_std": 0.40625, "grad_norm": 328.6873779296875, "learning_rate": 1e-06, "loss": 0.0443, "num_tokens": 1259801347.0, "reward": 0.3114762306213379, "reward_std": 0.029843464493751526, "rewards/progression_diversity/mean": -0.004722995683550835, "rewards/progression_diversity/std": 0.04021621122956276, "rewards/symbolic_reward_accuracy/mean": 0.16796875, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.7057291269302368, "rewards/symbolic_reward_partial_score/std": 0.2080366462469101, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0551323890686035, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 11.337949752807617, "step": 2105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.31999480724334717, "epoch": 3.375, "grad_norm": 0.016840625554323196, "learning_rate": 1e-06, "loss": 0.0714, "step": 2106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.32205140590667725, "epoch": 3.376602564102564, "grad_norm": 0.014530826359987259, "learning_rate": 1e-06, "loss": -0.0033, "step": 2107 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3153335601091385, "epoch": 3.378205128205128, "grad_norm": 0.013645652681589127, "learning_rate": 1e-06, "loss": 0.0475, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2217.0, "completions/mean_length": 1905.322265625, "completions/mean_terminated_length": 1528.122314453125, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "entropy": 0.29995962977409363, "epoch": 3.3798076923076925, "frac_reward_zero_std": 0.3125, "grad_norm": 1299.5802001953125, "learning_rate": 1e-06, "loss": 0.0862, "num_tokens": 1261670184.0, "reward": 0.3757146894931793, "reward_std": 0.06831994652748108, "rewards/progression_diversity/mean": -0.008122788742184639, "rewards/progression_diversity/std": 0.05085650831460953, "rewards/symbolic_reward_accuracy/mean": 0.267578125, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.7253092527389526, "rewards/symbolic_reward_partial_score/std": 0.23041704297065735, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0357104539871216, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 20.614686965942383, "step": 2109 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30187252163887024, "epoch": 3.3814102564102564, "grad_norm": 0.9350156188011169, "learning_rate": 1e-06, "loss": 0.033, "step": 2110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3068780303001404, "epoch": 3.3830128205128207, "grad_norm": 0.010901644825935364, "learning_rate": 1e-06, "loss": 0.0526, "step": 2111 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.29835619032382965, "epoch": 3.3846153846153846, "grad_norm": 16258.962890625, "learning_rate": 1e-06, "loss": 1.5092, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2335.0, "completions/mean_length": 1839.962890625, "completions/mean_terminated_length": 1550.2410888671875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "entropy": 0.3106384575366974, "epoch": 3.386217948717949, "frac_reward_zero_std": 0.46875, "grad_norm": 982.4223022460938, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 1263444485.0, "reward": 0.4431688189506531, "reward_std": 0.059487175196409225, "rewards/progression_diversity/mean": -0.005386716220527887, "rewards/progression_diversity/std": 0.03828057646751404, "rewards/symbolic_reward_accuracy/mean": 0.357421875, "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, "rewards/symbolic_reward_partial_score/mean": 0.7671223878860474, "rewards/symbolic_reward_partial_score/std": 0.2305992692708969, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045996904373169, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 15.037046432495117, "step": 2113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3080595135688782, "epoch": 3.3878205128205128, "grad_norm": 0.011529210954904556, "learning_rate": 1e-06, "loss": 0.0223, "step": 2114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3010105937719345, "epoch": 3.389423076923077, "grad_norm": 2453.80712890625, "learning_rate": 1e-06, "loss": 0.1075, "step": 2115 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29755890369415283, "epoch": 3.391025641025641, "grad_norm": 0.02169516310095787, "learning_rate": 1e-06, "loss": 0.0655, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 1782.4921875, "completions/mean_terminated_length": 1550.7222900390625, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "entropy": 0.3025133013725281, "epoch": 3.3926282051282053, "frac_reward_zero_std": 0.40625, "grad_norm": 774.7922973632812, "learning_rate": 1e-06, "loss": 0.0502, "num_tokens": 1265197169.0, "reward": 0.38823676109313965, "reward_std": 0.03497297316789627, "rewards/progression_diversity/mean": -0.0049418737180531025, "rewards/progression_diversity/std": 0.038801729679107666, "rewards/symbolic_reward_accuracy/mean": 0.26953125, "rewards/symbolic_reward_accuracy/std": 0.44415023922920227, "rewards/symbolic_reward_partial_score/mean": 0.7578287720680237, "rewards/symbolic_reward_partial_score/std": 0.20313479006290436, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0485944747924805, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 13.493077278137207, "step": 2117 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31337393820285797, "epoch": 3.394230769230769, "grad_norm": 0.024296533316373825, "learning_rate": 1e-06, "loss": -0.0042, "step": 2118 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.30753542482852936, "epoch": 3.3958333333333335, "grad_norm": 0.02327045053243637, "learning_rate": 1e-06, "loss": -0.0024, "step": 2119 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.29232731461524963, "epoch": 3.3974358974358974, "grad_norm": 0.015478910878300667, "learning_rate": 1e-06, "loss": 0.0821, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 1758.404296875, "completions/mean_terminated_length": 1526.2520751953125, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "entropy": 0.2938903123140335, "epoch": 3.3990384615384617, "frac_reward_zero_std": 0.28125, "grad_norm": 500.75775146484375, "learning_rate": 1e-06, "loss": 0.0747, "num_tokens": 1267013136.0, "reward": 0.30194687843322754, "reward_std": 0.0503971166908741, "rewards/progression_diversity/mean": -0.005022912751883268, "rewards/progression_diversity/std": 0.04021916538476944, "rewards/symbolic_reward_accuracy/mean": 0.14453125, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.7201985716819763, "rewards/symbolic_reward_partial_score/std": 0.20605868101119995, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.051767349243164, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 11.300193786621094, "step": 2121 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3104073405265808, "epoch": 3.4006410256410255, "grad_norm": 0.026015326380729675, "learning_rate": 1e-06, "loss": 0.0001, "step": 2122 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30487897992134094, "epoch": 3.40224358974359, "grad_norm": 0.022471783682703972, "learning_rate": 1e-06, "loss": 0.0026, "step": 2123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.299276664853096, "epoch": 3.4038461538461537, "grad_norm": 0.022479599341750145, "learning_rate": 1e-06, "loss": 0.0366, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3687.0, "completions/mean_length": 1766.92578125, "completions/mean_terminated_length": 1564.3128662109375, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "entropy": 0.30581988394260406, "epoch": 3.405448717948718, "frac_reward_zero_std": 0.375, "grad_norm": 0.0528433695435524, "learning_rate": 1e-06, "loss": 0.037, "num_tokens": 1268731466.0, "reward": 0.3584764301776886, "reward_std": 0.056666046380996704, "rewards/progression_diversity/mean": -0.003432408208027482, "rewards/progression_diversity/std": 0.0318170040845871, "rewards/symbolic_reward_accuracy/mean": 0.234375, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.7308430671691895, "rewards/symbolic_reward_partial_score/std": 0.21932339668273926, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0599393844604492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 7.578122138977051, "step": 2125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3097575902938843, "epoch": 3.407051282051282, "grad_norm": 0.023739317432045937, "learning_rate": 1e-06, "loss": 0.0415, "step": 2126 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31125499308109283, "epoch": 3.4086538461538463, "grad_norm": 0.012953310273587704, "learning_rate": 1e-06, "loss": 0.0353, "step": 2127 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.31313419342041016, "epoch": 3.41025641025641, "grad_norm": 0.013461051508784294, "learning_rate": 1e-06, "loss": 0.0162, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 1800.833984375, "completions/mean_terminated_length": 1539.9024658203125, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "entropy": 0.3056839108467102, "epoch": 3.4118589743589745, "frac_reward_zero_std": 0.4375, "grad_norm": 0.035352922976017, "learning_rate": 1e-06, "loss": 0.0161, "num_tokens": 1270457349.0, "reward": 0.45632848143577576, "reward_std": 0.054405439645051956, "rewards/progression_diversity/mean": -0.005339231342077255, "rewards/progression_diversity/std": 0.04081280156970024, "rewards/symbolic_reward_accuracy/mean": 0.359375, "rewards/symbolic_reward_accuracy/std": 0.48028653860092163, "rewards/symbolic_reward_partial_score/mean": 0.8057780265808105, "rewards/symbolic_reward_partial_score/std": 0.20670460164546967, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.054563045501709, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 9.924196243286133, "step": 2129 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29381316900253296, "epoch": 3.4134615384615383, "grad_norm": 11953.2880859375, "learning_rate": 1e-06, "loss": 0.3212, "step": 2130 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2993968427181244, "epoch": 3.4150641025641026, "grad_norm": 0.016664206981658936, "learning_rate": 1e-06, "loss": 0.0118, "step": 2131 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.29864759743213654, "epoch": 3.4166666666666665, "grad_norm": 1193.7196044921875, "learning_rate": 1e-06, "loss": 0.0213, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 2058.951171875, "completions/mean_terminated_length": 1506.8701171875, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "entropy": 0.2732394188642502, "epoch": 3.418269230769231, "frac_reward_zero_std": 0.375, "grad_norm": 527.1871337890625, "learning_rate": 1e-06, "loss": 0.0862, "num_tokens": 1272399164.0, "reward": 0.33772745728492737, "reward_std": 0.049203261733055115, "rewards/progression_diversity/mean": -0.010945793241262436, "rewards/progression_diversity/std": 0.0561109222471714, "rewards/symbolic_reward_accuracy/mean": 0.21484375, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.7062011957168579, "rewards/symbolic_reward_partial_score/std": 0.2347198873758316, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0237281322479248, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 24.222614288330078, "step": 2133 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2859821915626526, "epoch": 3.4198717948717947, "grad_norm": 0.02516080252826214, "learning_rate": 1e-06, "loss": -0.0053, "step": 2134 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2708042562007904, "epoch": 3.421474358974359, "grad_norm": 0.01243208535015583, "learning_rate": 1e-06, "loss": 0.077, "step": 2135 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.26599569618701935, "epoch": 3.423076923076923, "grad_norm": 0.008642677217721939, "learning_rate": 1e-06, "loss": 0.1548, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 1775.25, "completions/mean_terminated_length": 1484.2391357421875, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "entropy": 0.2837539613246918, "epoch": 3.4246794871794872, "frac_reward_zero_std": 0.4375, "grad_norm": 216.27127075195312, "learning_rate": 1e-06, "loss": 0.0381, "num_tokens": 1274171948.0, "reward": 0.28258827328681946, "reward_std": 0.03927141800522804, "rewards/progression_diversity/mean": -0.00484531931579113, "rewards/progression_diversity/std": 0.036202192306518555, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.7051432132720947, "rewards/symbolic_reward_partial_score/std": 0.19589273631572723, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0403019189834595, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 15.552392959594727, "step": 2137 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.27445077896118164, "epoch": 3.426282051282051, "grad_norm": 0.011864651925861835, "learning_rate": 1e-06, "loss": 0.093, "step": 2138 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2975247800350189, "epoch": 3.4278846153846154, "grad_norm": 0.012054034508764744, "learning_rate": 1e-06, "loss": -0.0002, "step": 2139 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.2823929637670517, "epoch": 3.4294871794871793, "grad_norm": 0.015474159270524979, "learning_rate": 1e-06, "loss": 0.0769, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 1778.083984375, "completions/mean_terminated_length": 1516.7454833984375, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "entropy": 0.28506043553352356, "epoch": 3.4310897435897436, "frac_reward_zero_std": 0.46875, "grad_norm": 0.018360108137130737, "learning_rate": 1e-06, "loss": -0.007, "num_tokens": 1275971479.0, "reward": 0.3357412815093994, "reward_std": 0.02756696194410324, "rewards/progression_diversity/mean": -0.004975297022610903, "rewards/progression_diversity/std": 0.03728037327528, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.7488607168197632, "rewards/symbolic_reward_partial_score/std": 0.1986795961856842, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0461030006408691, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 12.876171112060547, "step": 2141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2805647701025009, "epoch": 3.4326923076923075, "grad_norm": 0.014698946848511696, "learning_rate": 1e-06, "loss": 0.0504, "step": 2142 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.286907359957695, "epoch": 3.434294871794872, "grad_norm": 0.020853351801633835, "learning_rate": 1e-06, "loss": 0.0685, "step": 2143 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.28968897461891174, "epoch": 3.435897435897436, "grad_norm": 0.010319734923541546, "learning_rate": 1e-06, "loss": 0.0058, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 1784.212890625, "completions/mean_terminated_length": 1522.9840087890625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "entropy": 0.3004123270511627, "epoch": 3.4375, "frac_reward_zero_std": 0.375, "grad_norm": 258.38421630859375, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 1277659044.0, "reward": 0.4109523296356201, "reward_std": 0.08106425404548645, "rewards/progression_diversity/mean": -0.005354299675673246, "rewards/progression_diversity/std": 0.04184706509113312, "rewards/symbolic_reward_accuracy/mean": 0.306640625, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.7619466781616211, "rewards/symbolic_reward_partial_score/std": 0.23045581579208374, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0512118339538574, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 11.705215454101562, "step": 2145 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.29704779386520386, "epoch": 3.439102564102564, "grad_norm": 0.023789308965206146, "learning_rate": 1e-06, "loss": 0.0442, "step": 2146 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.29680609703063965, "epoch": 3.440705128205128, "grad_norm": 0.022177523002028465, "learning_rate": 1e-06, "loss": 0.038, "step": 2147 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.29306888580322266, "epoch": 3.4423076923076925, "grad_norm": 0.009412667714059353, "learning_rate": 1e-06, "loss": 0.058, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1976.712890625, "completions/mean_terminated_length": 1571.688720703125, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "entropy": 0.2784651964902878, "epoch": 3.4439102564102564, "frac_reward_zero_std": 0.375, "grad_norm": 62.723731994628906, "learning_rate": 1e-06, "loss": 0.0358, "num_tokens": 1279531361.0, "reward": 0.4271858334541321, "reward_std": 0.03608609363436699, "rewards/progression_diversity/mean": -0.008470406755805016, "rewards/progression_diversity/std": 0.05123213678598404, "rewards/symbolic_reward_accuracy/mean": 0.3359375, "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, "rewards/symbolic_reward_partial_score/mean": 0.7549641728401184, "rewards/symbolic_reward_partial_score/std": 0.22292669117450714, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0334545373916626, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 19.96367835998535, "step": 2149 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.28350482881069183, "epoch": 3.4455128205128207, "grad_norm": 0.010959633626043797, "learning_rate": 1e-06, "loss": 0.07, "step": 2150 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2834584712982178, "epoch": 3.4471153846153846, "grad_norm": 0.014038468711078167, "learning_rate": 1e-06, "loss": 0.0019, "step": 2151 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.27503225207328796, "epoch": 3.448717948717949, "grad_norm": 0.03383004292845726, "learning_rate": 1e-06, "loss": 0.0429, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2406.0, "completions/mean_length": 2020.60546875, "completions/mean_terminated_length": 1557.2701416015625, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "entropy": 0.2737235426902771, "epoch": 3.4503205128205128, "frac_reward_zero_std": 0.375, "grad_norm": 0.028786683455109596, "learning_rate": 1e-06, "loss": 0.0172, "num_tokens": 1281467799.0, "reward": 0.34394705295562744, "reward_std": 0.05187678337097168, "rewards/progression_diversity/mean": -0.009591124951839447, "rewards/progression_diversity/std": 0.05538209527730942, "rewards/symbolic_reward_accuracy/mean": 0.201171875, "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, "rewards/symbolic_reward_partial_score/mean": 0.7490234375, "rewards/symbolic_reward_partial_score/std": 0.20375196635723114, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0315946340560913, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 20.09771728515625, "step": 2153 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2721341699361801, "epoch": 3.451923076923077, "grad_norm": 0.00952248927205801, "learning_rate": 1e-06, "loss": 0.1071, "step": 2154 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.27051886916160583, "epoch": 3.453525641025641, "grad_norm": 0.015221396461129189, "learning_rate": 1e-06, "loss": 0.0418, "step": 2155 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.26560334861278534, "epoch": 3.4551282051282053, "grad_norm": 0.010249440558254719, "learning_rate": 1e-06, "loss": 0.0637, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 2162.498046875, "completions/mean_terminated_length": 1584.38818359375, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "entropy": 0.266851544380188, "epoch": 3.456730769230769, "frac_reward_zero_std": 0.28125, "grad_norm": 609.803955078125, "learning_rate": 1e-06, "loss": 0.072, "num_tokens": 1283476326.0, "reward": 0.39281851053237915, "reward_std": 0.035197146236896515, "rewards/progression_diversity/mean": -0.013561811298131943, "rewards/progression_diversity/std": 0.06690501421689987, "rewards/symbolic_reward_accuracy/mean": 0.275390625, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.7610189318656921, "rewards/symbolic_reward_partial_score/std": 0.19838300347328186, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0253196954727173, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 22.992713928222656, "step": 2157 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.269494891166687, "epoch": 3.4583333333333335, "grad_norm": 0.024904318153858185, "learning_rate": 1e-06, "loss": 0.0599, "step": 2158 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.25990206003189087, "epoch": 3.4599358974358974, "grad_norm": 0.01417806651443243, "learning_rate": 1e-06, "loss": 0.0332, "step": 2159 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2642335593700409, "epoch": 3.4615384615384617, "grad_norm": 0.00956464372575283, "learning_rate": 1e-06, "loss": 0.0442, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 2211.890625, "completions/mean_terminated_length": 1575.591796875, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "entropy": 0.2689553052186966, "epoch": 3.4631410256410255, "frac_reward_zero_std": 0.28125, "grad_norm": 659.0357666015625, "learning_rate": 1e-06, "loss": 0.0529, "num_tokens": 1285388046.0, "reward": 0.28991633653640747, "reward_std": 0.031822673976421356, "rewards/progression_diversity/mean": -0.012760378420352936, "rewards/progression_diversity/std": 0.0614657998085022, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.7272297739982605, "rewards/symbolic_reward_partial_score/std": 0.18316437304019928, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.018622875213623, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 27.381484985351562, "step": 2161 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.26476864516735077, "epoch": 3.46474358974359, "grad_norm": 0.013940541073679924, "learning_rate": 1e-06, "loss": 0.1169, "step": 2162 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2731598913669586, "epoch": 3.4663461538461537, "grad_norm": 77.1856918334961, "learning_rate": 1e-06, "loss": 0.0693, "step": 2163 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2830631732940674, "epoch": 3.467948717948718, "grad_norm": 0.026339799165725708, "learning_rate": 1e-06, "loss": 0.0181, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 2145.0546875, "completions/mean_terminated_length": 1566.2357177734375, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "entropy": 0.27289319038391113, "epoch": 3.469551282051282, "frac_reward_zero_std": 0.34375, "grad_norm": 247.32151794433594, "learning_rate": 1e-06, "loss": 0.0502, "num_tokens": 1287364458.0, "reward": 0.4520633816719055, "reward_std": 0.041082873940467834, "rewards/progression_diversity/mean": -0.013879441656172276, "rewards/progression_diversity/std": 0.06810557842254639, "rewards/symbolic_reward_accuracy/mean": 0.3671875, "rewards/symbolic_reward_accuracy/std": 0.48250964283943176, "rewards/symbolic_reward_partial_score/mean": 0.7729655504226685, "rewards/symbolic_reward_partial_score/std": 0.21098491549491882, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.033386468887329, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 19.480430603027344, "step": 2165 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2645433694124222, "epoch": 3.4711538461538463, "grad_norm": 0.010487830266356468, "learning_rate": 1e-06, "loss": 0.081, "step": 2166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2667945772409439, "epoch": 3.47275641025641, "grad_norm": 0.012948272749781609, "learning_rate": 1e-06, "loss": 0.0463, "step": 2167 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2764432579278946, "epoch": 3.4743589743589745, "grad_norm": 0.016345176845788956, "learning_rate": 1e-06, "loss": 0.0114, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 2468.5, "completions/mean_terminated_length": 1540.800048828125, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "entropy": 0.2626621425151825, "epoch": 3.4759615384615383, "frac_reward_zero_std": 0.21875, "grad_norm": 77.64374542236328, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 1289573546.0, "reward": 0.23555369675159454, "reward_std": 0.02879556640982628, "rewards/progression_diversity/mean": -0.02080235444009304, "rewards/progression_diversity/std": 0.08193078637123108, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.6706380844116211, "rewards/symbolic_reward_partial_score/std": 0.17417360842227936, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0140269994735718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 29.442203521728516, "step": 2169 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.25798384845256805, "epoch": 3.4775641025641026, "grad_norm": 9.645312309265137, "learning_rate": 1e-06, "loss": 0.0937, "step": 2170 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.25725966691970825, "epoch": 3.4791666666666665, "grad_norm": 0.025352254509925842, "learning_rate": 1e-06, "loss": 0.0417, "step": 2171 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.26060421764850616, "epoch": 3.480769230769231, "grad_norm": 0.012905375100672245, "learning_rate": 1e-06, "loss": 0.0643, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 2100.515625, "completions/mean_terminated_length": 1550.0364990234375, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "entropy": 0.2711750864982605, "epoch": 3.4823717948717947, "frac_reward_zero_std": 0.4375, "grad_norm": 3850.08984375, "learning_rate": 1e-06, "loss": 0.0719, "num_tokens": 1291521378.0, "reward": 0.2990831136703491, "reward_std": 0.02100459486246109, "rewards/progression_diversity/mean": -0.012100995518267155, "rewards/progression_diversity/std": 0.06185305863618851, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.7525553107261658, "rewards/symbolic_reward_partial_score/std": 0.1686214655637741, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.03285813331604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 19.567764282226562, "step": 2173 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.26994919776916504, "epoch": 3.483974358974359, "grad_norm": 0.02370203286409378, "learning_rate": 1e-06, "loss": 0.039, "step": 2174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.28110212087631226, "epoch": 3.485576923076923, "grad_norm": 0.02028624340891838, "learning_rate": 1e-06, "loss": 0.0025, "step": 2175 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2572477161884308, "epoch": 3.4871794871794872, "grad_norm": 0.007402379531413317, "learning_rate": 1e-06, "loss": 0.0899, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 1678.212890625, "completions/mean_terminated_length": 1533.1854248046875, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "entropy": 0.30332140624523163, "epoch": 3.488782051282051, "frac_reward_zero_std": 0.65625, "grad_norm": 0.013128817081451416, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 1293154063.0, "reward": 0.4060882329940796, "reward_std": 0.014026455581188202, "rewards/progression_diversity/mean": -0.0029925985727459192, "rewards/progression_diversity/std": 0.02976119890809059, "rewards/symbolic_reward_accuracy/mean": 0.310546875, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.7332844734191895, "rewards/symbolic_reward_partial_score/std": 0.22508755326271057, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0598652362823486, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 6.740238666534424, "step": 2177 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2928498536348343, "epoch": 3.4903846153846154, "grad_norm": 1097.95703125, "learning_rate": 1e-06, "loss": 0.0812, "step": 2178 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.29989422857761383, "epoch": 3.4919871794871793, "grad_norm": 0.009113781154155731, "learning_rate": 1e-06, "loss": 0.0097, "step": 2179 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29661744832992554, "epoch": 3.4935897435897436, "grad_norm": 0.041792068630456924, "learning_rate": 1e-06, "loss": 0.0276, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2569.0, "completions/mean_length": 1740.89453125, "completions/mean_terminated_length": 1567.260986328125, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "entropy": 0.28554295003414154, "epoch": 3.4951923076923075, "frac_reward_zero_std": 0.46875, "grad_norm": 380.84259033203125, "learning_rate": 1e-06, "loss": 0.0355, "num_tokens": 1294866057.0, "reward": 0.30031508207321167, "reward_std": 0.020207438617944717, "rewards/progression_diversity/mean": -0.004136052913963795, "rewards/progression_diversity/std": 0.03724845126271248, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.6932454109191895, "rewards/symbolic_reward_partial_score/std": 0.20394784212112427, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0567407608032227, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 7.55679988861084, "step": 2181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2933173179626465, "epoch": 3.496794871794872, "grad_norm": 0.02578102797269821, "learning_rate": 1e-06, "loss": 0.0043, "step": 2182 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2895757555961609, "epoch": 3.498397435897436, "grad_norm": 0.016278348863124847, "learning_rate": 1e-06, "loss": -0.004, "step": 2183 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2855415791273117, "epoch": 3.5, "grad_norm": 0.013905326835811138, "learning_rate": 1e-06, "loss": 0.0135, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 1967.490234375, "completions/mean_terminated_length": 1621.4940185546875, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "entropy": 0.2863430380821228, "epoch": 3.501602564102564, "frac_reward_zero_std": 0.4375, "grad_norm": 103.70210266113281, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 1296736836.0, "reward": 0.31307274103164673, "reward_std": 0.031172282993793488, "rewards/progression_diversity/mean": -0.0066928681917488575, "rewards/progression_diversity/std": 0.04427814856171608, "rewards/symbolic_reward_accuracy/mean": 0.162109375, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.7208821177482605, "rewards/symbolic_reward_partial_score/std": 0.19237685203552246, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0420305728912354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 15.591737747192383, "step": 2185 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2772316485643387, "epoch": 3.503205128205128, "grad_norm": 268.4742736816406, "learning_rate": 1e-06, "loss": 0.0571, "step": 2186 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2852557599544525, "epoch": 3.5048076923076925, "grad_norm": 0.01805300824344158, "learning_rate": 1e-06, "loss": 0.0119, "step": 2187 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.288587287068367, "epoch": 3.5064102564102564, "grad_norm": 0.017689945176243782, "learning_rate": 1e-06, "loss": 0.0324, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2846.0, "completions/mean_length": 1944.544921875, "completions/mean_terminated_length": 1686.184814453125, "completions/min_length": 1121.0, "completions/min_terminated_length": 1121.0, "entropy": 0.27069567143917084, "epoch": 3.5080128205128203, "frac_reward_zero_std": 0.5625, "grad_norm": 196.1943359375, "learning_rate": 1e-06, "loss": 0.0453, "num_tokens": 1298707867.0, "reward": 0.40786558389663696, "reward_std": 0.02084019035100937, "rewards/progression_diversity/mean": -0.004458786454051733, "rewards/progression_diversity/std": 0.03238670900464058, "rewards/symbolic_reward_accuracy/mean": 0.30859375, "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, "rewards/symbolic_reward_partial_score/mean": 0.7438150644302368, "rewards/symbolic_reward_partial_score/std": 0.21741990745067596, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.052014708518982, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 9.943605422973633, "step": 2189 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.27081412076950073, "epoch": 3.5096153846153846, "grad_norm": 0.013844393193721771, "learning_rate": 1e-06, "loss": 0.3136, "step": 2190 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2878330200910568, "epoch": 3.511217948717949, "grad_norm": 0.013577915728092194, "learning_rate": 1e-06, "loss": -0.0015, "step": 2191 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.27837416529655457, "epoch": 3.5128205128205128, "grad_norm": 0.009212753735482693, "learning_rate": 1e-06, "loss": 0.0124, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3301.0, "completions/mean_length": 2025.873046875, "completions/mean_terminated_length": 1681.278076171875, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "entropy": 0.277935653924942, "epoch": 3.5144230769230766, "frac_reward_zero_std": 0.375, "grad_norm": 408.2825927734375, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 1300680634.0, "reward": 0.3115173280239105, "reward_std": 0.014852182939648628, "rewards/progression_diversity/mean": -0.005984636954963207, "rewards/progression_diversity/std": 0.039667367935180664, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.7267415523529053, "rewards/symbolic_reward_partial_score/std": 0.1806911826133728, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0477591753005981, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 12.26329231262207, "step": 2193 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.2869657427072525, "epoch": 3.516025641025641, "grad_norm": 0.009542958810925484, "learning_rate": 1e-06, "loss": 5.8881, "step": 2194 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2763095796108246, "epoch": 3.5176282051282053, "grad_norm": 68490.9453125, "learning_rate": 1e-06, "loss": 14.7655, "step": 2195 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.28788307309150696, "epoch": 3.519230769230769, "grad_norm": 0.022392556071281433, "learning_rate": 1e-06, "loss": 3.6531, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 1910.25390625, "completions/mean_terminated_length": 1680.511962890625, "completions/min_length": 1155.0, "completions/min_terminated_length": 1155.0, "entropy": 0.2990610897541046, "epoch": 3.5208333333333335, "frac_reward_zero_std": 0.53125, "grad_norm": 134.34463500976562, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 1302494364.0, "reward": 0.38631004095077515, "reward_std": 0.027447409927845, "rewards/progression_diversity/mean": -0.004252912942320108, "rewards/progression_diversity/std": 0.033583469688892365, "rewards/symbolic_reward_accuracy/mean": 0.275390625, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.7383626699447632, "rewards/symbolic_reward_partial_score/std": 0.19246533513069153, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0586574077606201, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 7.78310489654541, "step": 2197 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30396056175231934, "epoch": 3.5224358974358974, "grad_norm": 0.016528572887182236, "learning_rate": 1e-06, "loss": -0.0008, "step": 2198 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.308067262172699, "epoch": 3.5240384615384617, "grad_norm": 0.01760723628103733, "learning_rate": 1e-06, "loss": -0.0059, "step": 2199 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2869449555873871, "epoch": 3.5256410256410255, "grad_norm": 835913.125, "learning_rate": 1e-06, "loss": 89.1969, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2777.0, "completions/mean_length": 1783.34375, "completions/mean_terminated_length": 1668.3779296875, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "entropy": 0.3045912981033325, "epoch": 3.52724358974359, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0315437950193882, "learning_rate": 1e-06, "loss": 0.0041, "num_tokens": 1304211468.0, "reward": 0.38467174768447876, "reward_std": 0.01725323125720024, "rewards/progression_diversity/mean": -0.002066644374281168, "rewards/progression_diversity/std": 0.023847082629799843, "rewards/symbolic_reward_accuracy/mean": 0.251953125, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.779052734375, "rewards/symbolic_reward_partial_score/std": 0.19922928512096405, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0659763813018799, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 4.61309814453125, "step": 2201 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.29725050926208496, "epoch": 3.5288461538461537, "grad_norm": 1866.8873291015625, "learning_rate": 1e-06, "loss": 0.1574, "step": 2202 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2994164824485779, "epoch": 3.530448717948718, "grad_norm": 0.00927357655018568, "learning_rate": 1e-06, "loss": -0.0, "step": 2203 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.30384716391563416, "epoch": 3.532051282051282, "grad_norm": 0.011953097768127918, "learning_rate": 1e-06, "loss": -0.0006, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3051.0, "completions/mean_length": 1808.693359375, "completions/mean_terminated_length": 1635.8636474609375, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "entropy": 0.2852201759815216, "epoch": 3.5336538461538463, "frac_reward_zero_std": 0.625, "grad_norm": 128.33778381347656, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 1306004719.0, "reward": 0.3015816807746887, "reward_std": 0.009979461319744587, "rewards/progression_diversity/mean": -0.001987605821341276, "rewards/progression_diversity/std": 0.02013721875846386, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.7566406726837158, "rewards/symbolic_reward_partial_score/std": 0.17199444770812988, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0611894130706787, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 5.849099636077881, "step": 2205 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2893652319908142, "epoch": 3.53525641025641, "grad_norm": 0.021172404289245605, "learning_rate": 1e-06, "loss": 0.0304, "step": 2206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.30061855912208557, "epoch": 3.5368589743589745, "grad_norm": 0.007474346086382866, "learning_rate": 1e-06, "loss": -0.0081, "step": 2207 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.28687790036201477, "epoch": 3.5384615384615383, "grad_norm": 0.008998925797641277, "learning_rate": 1e-06, "loss": 0.063, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 1647.38671875, "completions/mean_terminated_length": 1589.59619140625, "completions/min_length": 1078.0, "completions/min_terminated_length": 1078.0, "entropy": 0.2984195798635483, "epoch": 3.5400641025641026, "frac_reward_zero_std": 0.6875, "grad_norm": 0.022925743833184242, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 1307736341.0, "reward": 0.3372430205345154, "reward_std": 0.02868266962468624, "rewards/progression_diversity/mean": -0.0008010210585780442, "rewards/progression_diversity/std": 0.012307359836995602, "rewards/symbolic_reward_accuracy/mean": 0.193359375, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.7381021976470947, "rewards/symbolic_reward_partial_score/std": 0.18334569036960602, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0685679912567139, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 2.327254056930542, "step": 2209 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.292475089430809, "epoch": 3.5416666666666665, "grad_norm": 0.010455112904310226, "learning_rate": 1e-06, "loss": 0.0001, "step": 2210 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2838650196790695, "epoch": 3.543269230769231, "grad_norm": 0.0170154869556427, "learning_rate": 1e-06, "loss": 0.0223, "step": 2211 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2912348359823227, "epoch": 3.5448717948717947, "grad_norm": 0.014277713373303413, "learning_rate": 1e-06, "loss": -0.001, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 1732.833984375, "completions/mean_terminated_length": 1588.34521484375, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "entropy": 0.29485444724559784, "epoch": 3.546474358974359, "frac_reward_zero_std": 0.625, "grad_norm": 0.012255196459591389, "learning_rate": 1e-06, "loss": 0.031, "num_tokens": 1309492480.0, "reward": 0.40414494276046753, "reward_std": 0.017114289104938507, "rewards/progression_diversity/mean": -0.002500717993825674, "rewards/progression_diversity/std": 0.025870388373732567, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.7931965589523315, "rewards/symbolic_reward_partial_score/std": 0.17990581691265106, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0640480518341064, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 4.959805011749268, "step": 2213 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2949848175048828, "epoch": 3.5480769230769234, "grad_norm": 0.023037033155560493, "learning_rate": 1e-06, "loss": 0.0215, "step": 2214 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2907959371805191, "epoch": 3.5496794871794872, "grad_norm": 0.017563870176672935, "learning_rate": 1e-06, "loss": 0.0405, "step": 2215 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.29562048614025116, "epoch": 3.551282051282051, "grad_norm": 0.008448651060461998, "learning_rate": 1e-06, "loss": -0.0048, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 1673.904296875, "completions/mean_terminated_length": 1528.8343505859375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "entropy": 0.2969505935907364, "epoch": 3.5528846153846154, "frac_reward_zero_std": 0.5625, "grad_norm": 0.011285161599516869, "learning_rate": 1e-06, "loss": -0.0043, "num_tokens": 1311276607.0, "reward": 0.37819382548332214, "reward_std": 0.02540595829486847, "rewards/progression_diversity/mean": -0.0028848343063145876, "rewards/progression_diversity/std": 0.028827061876654625, "rewards/symbolic_reward_accuracy/mean": 0.2421875, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.7763671875, "rewards/symbolic_reward_partial_score/std": 0.18114922940731049, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0641391277313232, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 5.106628894805908, "step": 2217 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29557299613952637, "epoch": 3.5544871794871797, "grad_norm": 3402.72705078125, "learning_rate": 1e-06, "loss": 0.0937, "step": 2218 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2919696867465973, "epoch": 3.5560897435897436, "grad_norm": 0.012350378558039665, "learning_rate": 1e-06, "loss": -0.0099, "step": 2219 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.28622542321681976, "epoch": 3.5576923076923075, "grad_norm": 0.01743849739432335, "learning_rate": 1e-06, "loss": 0.0084, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 1677.265625, "completions/mean_terminated_length": 1532.228759765625, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "entropy": 0.3065932095050812, "epoch": 3.559294871794872, "frac_reward_zero_std": 0.46875, "grad_norm": 0.021933453157544136, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 1312952407.0, "reward": 0.42246508598327637, "reward_std": 0.04646175354719162, "rewards/progression_diversity/mean": -0.0020300918258726597, "rewards/progression_diversity/std": 0.02212829887866974, "rewards/symbolic_reward_accuracy/mean": 0.314453125, "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, "rewards/symbolic_reward_partial_score/mean": 0.780029296875, "rewards/symbolic_reward_partial_score/std": 0.20408369600772858, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0639897584915161, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 5.478954792022705, "step": 2221 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3041534572839737, "epoch": 3.560897435897436, "grad_norm": 0.019840234890580177, "learning_rate": 1e-06, "loss": 0.0237, "step": 2222 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.299197256565094, "epoch": 3.5625, "grad_norm": 0.015518907457590103, "learning_rate": 1e-06, "loss": 0.012, "step": 2223 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29240258038043976, "epoch": 3.564102564102564, "grad_norm": 0.017431948333978653, "learning_rate": 1e-06, "loss": 0.069, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1671.21484375, "completions/mean_terminated_length": 1555.3660888671875, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "entropy": 0.2979786843061447, "epoch": 3.565705128205128, "frac_reward_zero_std": 0.6875, "grad_norm": 362.736572265625, "learning_rate": 1e-06, "loss": 0.0251, "num_tokens": 1314672565.0, "reward": 0.35204920172691345, "reward_std": 0.017848532646894455, "rewards/progression_diversity/mean": -0.002111406996846199, "rewards/progression_diversity/std": 0.024554969742894173, "rewards/symbolic_reward_accuracy/mean": 0.21484375, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.7445312738418579, "rewards/symbolic_reward_partial_score/std": 0.18352799117565155, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0668435096740723, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 4.025793075561523, "step": 2225 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.29960186779499054, "epoch": 3.5673076923076925, "grad_norm": 0.014206199906766415, "learning_rate": 1e-06, "loss": 0.0001, "step": 2226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.29641352593898773, "epoch": 3.5689102564102564, "grad_norm": 0.013473715633153915, "learning_rate": 1e-06, "loss": 0.0365, "step": 2227 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.29800280928611755, "epoch": 3.5705128205128203, "grad_norm": 0.01775544323027134, "learning_rate": 1e-06, "loss": -0.0016, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 1592.27734375, "completions/mean_terminated_length": 1534.2706298828125, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "entropy": 0.2977886497974396, "epoch": 3.5721153846153846, "frac_reward_zero_std": 0.5625, "grad_norm": 1049.2088623046875, "learning_rate": 1e-06, "loss": 0.0501, "num_tokens": 1316468131.0, "reward": 0.3352065682411194, "reward_std": 0.040253110229969025, "rewards/progression_diversity/mean": -0.000830255274195224, "rewards/progression_diversity/std": 0.015343528240919113, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.7475911378860474, "rewards/symbolic_reward_partial_score/std": 0.184952974319458, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0706126689910889, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 2.3357362747192383, "step": 2229 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2995491325855255, "epoch": 3.573717948717949, "grad_norm": 0.029800059273838997, "learning_rate": 1e-06, "loss": -0.0004, "step": 2230 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.30069583654403687, "epoch": 3.5753205128205128, "grad_norm": 0.02240557037293911, "learning_rate": 1e-06, "loss": -0.0074, "step": 2231 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.29591603577136993, "epoch": 3.5769230769230766, "grad_norm": 0.020445875823497772, "learning_rate": 1e-06, "loss": 0.003, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 1476.59765625, "completions/mean_terminated_length": 1476.59765625, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "entropy": 0.3097764253616333, "epoch": 3.578525641025641, "frac_reward_zero_std": 0.90625, "grad_norm": 0.009613445959985256, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 1318013077.0, "reward": 0.2948632836341858, "reward_std": 0.003713551675900817, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.7328775525093079, "rewards/symbolic_reward_partial_score/std": 0.16866321861743927, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0766642093658447, "sampling/importance_sampling_ratio/min": 5.8294663176639006e-05, "sampling/sampling_logp_difference/max": 9.75, "sampling/sampling_logp_difference/mean": 0.14255455136299133, "step": 2233 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.31367750465869904, "epoch": 3.5801282051282053, "grad_norm": 0.011548890732228756, "learning_rate": 1e-06, "loss": -0.0032, "step": 2234 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.3107854425907135, "epoch": 3.581730769230769, "grad_norm": 0.005681135691702366, "learning_rate": 1e-06, "loss": 0.0013, "step": 2235 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.313510924577713, "epoch": 3.5833333333333335, "grad_norm": 0.0011630782391875982, "learning_rate": 1e-06, "loss": 0.0037, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 1488.98828125, "completions/mean_terminated_length": 1488.98828125, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "entropy": 0.3215675801038742, "epoch": 3.5849358974358974, "frac_reward_zero_std": 0.84375, "grad_norm": 0.008507279679179192, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 1319585743.0, "reward": 0.3924804925918579, "reward_std": 0.01995375007390976, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.259765625, "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, "rewards/symbolic_reward_partial_score/mean": 0.7887369990348816, "rewards/symbolic_reward_partial_score/std": 0.16583316028118134, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078096866607666, "sampling/importance_sampling_ratio/min": 0.0001314402325078845, "sampling/sampling_logp_difference/max": 8.936958312988281, "sampling/sampling_logp_difference/mean": 0.14448994398117065, "step": 2237 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.31759844720363617, "epoch": 3.5865384615384617, "grad_norm": 0.005916442256420851, "learning_rate": 1e-06, "loss": 0.0009, "step": 2238 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.31728270649909973, "epoch": 3.5881410256410255, "grad_norm": 0.01380041241645813, "learning_rate": 1e-06, "loss": -0.0004, "step": 2239 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.3198378384113312, "epoch": 3.58974358974359, "grad_norm": 0.011921794153749943, "learning_rate": 1e-06, "loss": -0.0002, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 1711.978515625, "completions/mean_terminated_length": 1508.60400390625, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "entropy": 0.29430273175239563, "epoch": 3.5913461538461537, "frac_reward_zero_std": 0.46875, "grad_norm": 607.2608032226562, "learning_rate": 1e-06, "loss": 0.0939, "num_tokens": 1321497412.0, "reward": 0.3224715292453766, "reward_std": 0.018792927265167236, "rewards/progression_diversity/mean": -0.0033359499648213387, "rewards/progression_diversity/std": 0.029082374647259712, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.7052246332168579, "rewards/symbolic_reward_partial_score/std": 0.20231178402900696, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0549734830856323, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 11.582987785339355, "step": 2241 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3087223768234253, "epoch": 3.592948717948718, "grad_norm": 0.012669588439166546, "learning_rate": 1e-06, "loss": -0.0031, "step": 2242 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3084537237882614, "epoch": 3.594551282051282, "grad_norm": 0.02636776864528656, "learning_rate": 1e-06, "loss": 0.0134, "step": 2243 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3076443672180176, "epoch": 3.5961538461538463, "grad_norm": 94.45758056640625, "learning_rate": 1e-06, "loss": 0.0224, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3366.0, "completions/mean_length": 1752.544921875, "completions/mean_terminated_length": 1549.732666015625, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "entropy": 0.31993138790130615, "epoch": 3.59775641025641, "frac_reward_zero_std": 0.59375, "grad_norm": 107.91487121582031, "learning_rate": 1e-06, "loss": 0.0304, "num_tokens": 1323181931.0, "reward": 0.44030338525772095, "reward_std": 0.029550693929195404, "rewards/progression_diversity/mean": -0.00335341296158731, "rewards/progression_diversity/std": 0.03010004572570324, "rewards/symbolic_reward_accuracy/mean": 0.3359375, "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, "rewards/symbolic_reward_partial_score/mean": 0.7978678345680237, "rewards/symbolic_reward_partial_score/std": 0.19323518872261047, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059898018836975, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 10.545293807983398, "step": 2245 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3280244767665863, "epoch": 3.5993589743589745, "grad_norm": 0.02449607476592064, "learning_rate": 1e-06, "loss": 0.0009, "step": 2246 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.32079842686653137, "epoch": 3.6009615384615383, "grad_norm": 0.018285367637872696, "learning_rate": 1e-06, "loss": 0.059, "step": 2247 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.32340890169143677, "epoch": 3.6025641025641026, "grad_norm": 0.00783622357994318, "learning_rate": 1e-06, "loss": 0.0208, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 1838.962890625, "completions/mean_terminated_length": 1578.713623046875, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "entropy": 0.3316734880208969, "epoch": 3.6041666666666665, "frac_reward_zero_std": 0.59375, "grad_norm": 0.011173716746270657, "learning_rate": 1e-06, "loss": 0.0217, "num_tokens": 1324944408.0, "reward": 0.37416577339172363, "reward_std": 0.039710745215415955, "rewards/progression_diversity/mean": -0.004324798006564379, "rewards/progression_diversity/std": 0.033225029706954956, "rewards/symbolic_reward_accuracy/mean": 0.240234375, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.7694987058639526, "rewards/symbolic_reward_partial_score/std": 0.2098463922739029, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0569475889205933, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 12.69996452331543, "step": 2249 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31876716017723083, "epoch": 3.605769230769231, "grad_norm": 0.010406982153654099, "learning_rate": 1e-06, "loss": 0.0613, "step": 2250 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.322706013917923, "epoch": 3.6073717948717947, "grad_norm": 8.13254451751709, "learning_rate": 1e-06, "loss": 0.0253, "step": 2251 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.32810820639133453, "epoch": 3.608974358974359, "grad_norm": 0.506855845451355, "learning_rate": 1e-06, "loss": 0.0166, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 1849.2734375, "completions/mean_terminated_length": 1589.2086181640625, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "entropy": 0.3225103169679642, "epoch": 3.6105769230769234, "frac_reward_zero_std": 0.5, "grad_norm": 0.011867878027260303, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 1326873332.0, "reward": 0.24351388216018677, "reward_std": 0.017110900953412056, "rewards/progression_diversity/mean": -0.004082350060343742, "rewards/progression_diversity/std": 0.03328991308808327, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.689453125, "rewards/symbolic_reward_partial_score/std": 0.1663842499256134, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0547860860824585, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 12.677425384521484, "step": 2253 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.31891578435897827, "epoch": 3.6121794871794872, "grad_norm": 250185.734375, "learning_rate": 1e-06, "loss": 26.996, "step": 2254 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.32373523712158203, "epoch": 3.613782051282051, "grad_norm": 46074924.0, "learning_rate": 1e-06, "loss": 2780.4724, "step": 2255 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.32337436079978943, "epoch": 3.6153846153846154, "grad_norm": 0.01320252288132906, "learning_rate": 1e-06, "loss": 2.3287, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 1806.265625, "completions/mean_terminated_length": 1604.1981201171875, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.3251884877681732, "epoch": 3.6169871794871797, "frac_reward_zero_std": 0.6875, "grad_norm": 670.6405029296875, "learning_rate": 1e-06, "loss": 0.0407, "num_tokens": 1328659420.0, "reward": 0.4107644557952881, "reward_std": 0.033720072358846664, "rewards/progression_diversity/mean": -0.003146503819152713, "rewards/progression_diversity/std": 0.02718130685389042, "rewards/symbolic_reward_accuracy/mean": 0.28125, "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, "rewards/symbolic_reward_partial_score/mean": 0.8081217408180237, "rewards/symbolic_reward_partial_score/std": 0.1699947714805603, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.063145637512207, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 9.102170944213867, "step": 2257 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3352958559989929, "epoch": 3.6185897435897436, "grad_norm": 0.013571917079389095, "learning_rate": 1e-06, "loss": -0.009, "step": 2258 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3343977779150009, "epoch": 3.6201923076923075, "grad_norm": 0.007707657292485237, "learning_rate": 1e-06, "loss": 0.0254, "step": 2259 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3308611214160919, "epoch": 3.621794871794872, "grad_norm": 0.008745530620217323, "learning_rate": 1e-06, "loss": 0.0577, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 1843.185546875, "completions/mean_terminated_length": 1612.3790283203125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "entropy": 0.32520605623722076, "epoch": 3.623397435897436, "frac_reward_zero_std": 0.34375, "grad_norm": 302.1920471191406, "learning_rate": 1e-06, "loss": 0.0245, "num_tokens": 1330441019.0, "reward": 0.3576836585998535, "reward_std": 0.02488100342452526, "rewards/progression_diversity/mean": -0.00360656064003706, "rewards/progression_diversity/std": 0.02981680817902088, "rewards/symbolic_reward_accuracy/mean": 0.216796875, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.7601073980331421, "rewards/symbolic_reward_partial_score/std": 0.19392609596252441, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0611423254013062, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 10.63503646850586, "step": 2261 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3336937874555588, "epoch": 3.625, "grad_norm": 0.02153988927602768, "learning_rate": 1e-06, "loss": 0.0082, "step": 2262 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.33279934525489807, "epoch": 3.626602564102564, "grad_norm": 0.03544102981686592, "learning_rate": 1e-06, "loss": -0.0076, "step": 2263 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.32690323889255524, "epoch": 3.628205128205128, "grad_norm": 0.018338177353143692, "learning_rate": 1e-06, "loss": 0.0575, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3015.0, "completions/mean_length": 1828.125, "completions/mean_terminated_length": 1626.3604736328125, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "entropy": 0.32338719069957733, "epoch": 3.6298076923076925, "frac_reward_zero_std": 0.46875, "grad_norm": 316.7826232910156, "learning_rate": 1e-06, "loss": 0.0849, "num_tokens": 1332244379.0, "reward": 0.2881593108177185, "reward_std": 0.0218312069773674, "rewards/progression_diversity/mean": -0.0034066529478877783, "rewards/progression_diversity/std": 0.03087565302848816, "rewards/symbolic_reward_accuracy/mean": 0.125, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.7152017951011658, "rewards/symbolic_reward_partial_score/std": 0.19993343949317932, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0661569833755493, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 8.5504789352417, "step": 2265 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.33843283355236053, "epoch": 3.6314102564102564, "grad_norm": 0.01766320690512657, "learning_rate": 1e-06, "loss": 0.0185, "step": 2266 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.332566499710083, "epoch": 3.6330128205128203, "grad_norm": 0.01683247648179531, "learning_rate": 1e-06, "loss": 0.3716, "step": 2267 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.334817498922348, "epoch": 3.6346153846153846, "grad_norm": 0.011439353227615356, "learning_rate": 1e-06, "loss": 0.0074, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3406.0, "completions/mean_length": 1817.625, "completions/mean_terminated_length": 1644.9012451171875, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "entropy": 0.3378443419933319, "epoch": 3.636217948717949, "frac_reward_zero_std": 0.40625, "grad_norm": 1454.8203125, "learning_rate": 1e-06, "loss": 0.0475, "num_tokens": 1334038011.0, "reward": 0.3225540518760681, "reward_std": 0.03539721295237541, "rewards/progression_diversity/mean": -0.0028966807294636965, "rewards/progression_diversity/std": 0.027032606303691864, "rewards/symbolic_reward_accuracy/mean": 0.181640625, "rewards/symbolic_reward_accuracy/std": 0.38592514395713806, "rewards/symbolic_reward_partial_score/mean": 0.7139486074447632, "rewards/symbolic_reward_partial_score/std": 0.20663875341415405, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0668741464614868, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 8.729013442993164, "step": 2269 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.32664966583251953, "epoch": 3.6378205128205128, "grad_norm": 0.023835686966776848, "learning_rate": 1e-06, "loss": 0.0819, "step": 2270 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3433900773525238, "epoch": 3.6394230769230766, "grad_norm": 0.015054881572723389, "learning_rate": 1e-06, "loss": -0.0076, "step": 2271 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3428369015455246, "epoch": 3.641025641025641, "grad_norm": 0.014643248170614243, "learning_rate": 1e-06, "loss": -0.0061, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 2124.025390625, "completions/mean_terminated_length": 1664.026123046875, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "entropy": 0.32794930040836334, "epoch": 3.6426282051282053, "frac_reward_zero_std": 0.4375, "grad_norm": 337.78216552734375, "learning_rate": 1e-06, "loss": 0.064, "num_tokens": 1335932760.0, "reward": 0.40950584411621094, "reward_std": 0.062236420810222626, "rewards/progression_diversity/mean": -0.006936722435057163, "rewards/progression_diversity/std": 0.04011997580528259, "rewards/symbolic_reward_accuracy/mean": 0.302734375, "rewards/symbolic_reward_accuracy/std": 0.45989060401916504, "rewards/symbolic_reward_partial_score/mean": 0.7682454586029053, "rewards/symbolic_reward_partial_score/std": 0.2247115969657898, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0450644493103027, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 19.491121292114258, "step": 2273 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3140611946582794, "epoch": 3.644230769230769, "grad_norm": 11.444162368774414, "learning_rate": 1e-06, "loss": 0.0813, "step": 2274 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3365636169910431, "epoch": 3.6458333333333335, "grad_norm": 0.028920264914631844, "learning_rate": 1e-06, "loss": 0.0278, "step": 2275 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3288937658071518, "epoch": 3.6474358974358974, "grad_norm": 0.01110448595136404, "learning_rate": 1e-06, "loss": 0.0625, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 2268.736328125, "completions/mean_terminated_length": 1694.945068359375, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "entropy": 0.3222566395998001, "epoch": 3.6490384615384617, "frac_reward_zero_std": 0.46875, "grad_norm": 1264.3094482421875, "learning_rate": 1e-06, "loss": 0.043, "num_tokens": 1337987217.0, "reward": 0.5130249261856079, "reward_std": 0.08471733331680298, "rewards/progression_diversity/mean": -0.01001398079097271, "rewards/progression_diversity/std": 0.0511719211935997, "rewards/symbolic_reward_accuracy/mean": 0.4453125, "rewards/symbolic_reward_accuracy/std": 0.49748632311820984, "rewards/symbolic_reward_partial_score/mean": 0.8282551765441895, "rewards/symbolic_reward_partial_score/std": 0.22973352670669556, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0400309562683105, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 21.84160614013672, "step": 2277 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3258931040763855, "epoch": 3.6506410256410255, "grad_norm": 0.021348005160689354, "learning_rate": 1e-06, "loss": 0.0619, "step": 2278 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31560730934143066, "epoch": 3.65224358974359, "grad_norm": 0.010525341145694256, "learning_rate": 1e-06, "loss": 0.0952, "step": 2279 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3183016777038574, "epoch": 3.6538461538461537, "grad_norm": 0.01716870814561844, "learning_rate": 1e-06, "loss": 0.0683, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 1967.486328125, "completions/mean_terminated_length": 1650.9560546875, "completions/min_length": 1010.0, "completions/min_terminated_length": 1010.0, "entropy": 0.3132941722869873, "epoch": 3.655448717948718, "frac_reward_zero_std": 0.3125, "grad_norm": 1842.542236328125, "learning_rate": 1e-06, "loss": 0.1164, "num_tokens": 1339971770.0, "reward": 0.2741461396217346, "reward_std": 0.0371134914457798, "rewards/progression_diversity/mean": -0.006771244574338198, "rewards/progression_diversity/std": 0.048090219497680664, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.6731607913970947, "rewards/symbolic_reward_partial_score/std": 0.20429718494415283, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0507025718688965, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 15.552701950073242, "step": 2281 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3280630111694336, "epoch": 3.657051282051282, "grad_norm": 0.013712570071220398, "learning_rate": 1e-06, "loss": 0.0099, "step": 2282 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3313450366258621, "epoch": 3.6586538461538463, "grad_norm": 0.0131281279027462, "learning_rate": 1e-06, "loss": 0.0101, "step": 2283 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3225076198577881, "epoch": 3.66025641025641, "grad_norm": 0.02563142031431198, "learning_rate": 1e-06, "loss": 0.0724, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 1943.8203125, "completions/mean_terminated_length": 1656.1673583984375, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "entropy": 0.31988005340099335, "epoch": 3.6618589743589745, "frac_reward_zero_std": 0.28125, "grad_norm": 1075.149658203125, "learning_rate": 1e-06, "loss": 0.0841, "num_tokens": 1341844046.0, "reward": 0.3518640398979187, "reward_std": 0.06046595424413681, "rewards/progression_diversity/mean": -0.005980104673653841, "rewards/progression_diversity/std": 0.04314341023564339, "rewards/symbolic_reward_accuracy/mean": 0.23046875, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.7140950560569763, "rewards/symbolic_reward_partial_score/std": 0.21391168236732483, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0547668933868408, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 14.504049301147461, "step": 2285 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.33179762959480286, "epoch": 3.6634615384615383, "grad_norm": 0.028521860018372536, "learning_rate": 1e-06, "loss": 0.0208, "step": 2286 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.324516236782074, "epoch": 3.6650641025641026, "grad_norm": 0.011499549262225628, "learning_rate": 1e-06, "loss": 0.0164, "step": 2287 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3264227360486984, "epoch": 3.6666666666666665, "grad_norm": 1.3879690170288086, "learning_rate": 1e-06, "loss": 0.0003, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 1695.943359375, "completions/mean_terminated_length": 1609.373291015625, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "entropy": 0.336402028799057, "epoch": 3.668269230769231, "frac_reward_zero_std": 0.5625, "grad_norm": 696.1746826171875, "learning_rate": 1e-06, "loss": 0.0194, "num_tokens": 1343498033.0, "reward": 0.3187481164932251, "reward_std": 0.019451720640063286, "rewards/progression_diversity/mean": -0.0016582165844738483, "rewards/progression_diversity/std": 0.02263886295258999, "rewards/symbolic_reward_accuracy/mean": 0.150390625, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.7617676258087158, "rewards/symbolic_reward_partial_score/std": 0.14879107475280762, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07216477394104, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 5.351583480834961, "step": 2289 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.334991991519928, "epoch": 3.6698717948717947, "grad_norm": 0.016783706843852997, "learning_rate": 1e-06, "loss": 0.0124, "step": 2290 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3304772675037384, "epoch": 3.671474358974359, "grad_norm": 0.022302983328700066, "learning_rate": 1e-06, "loss": 0.0245, "step": 2291 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3341434746980667, "epoch": 3.6730769230769234, "grad_norm": 0.012253678403794765, "learning_rate": 1e-06, "loss": 0.0019, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3012.0, "completions/mean_length": 1679.7109375, "completions/mean_terminated_length": 1593.0452880859375, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "entropy": 0.32631611824035645, "epoch": 3.6746794871794872, "frac_reward_zero_std": 0.5625, "grad_norm": 701.1774291992188, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 1345151549.0, "reward": 0.3470765948295593, "reward_std": 0.03532809019088745, "rewards/progression_diversity/mean": -0.0018145160283893347, "rewards/progression_diversity/std": 0.023674938827753067, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.737060546875, "rewards/symbolic_reward_partial_score/std": 0.2012006640434265, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0714975595474243, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 5.101583480834961, "step": 2293 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.33157244324684143, "epoch": 3.676282051282051, "grad_norm": 0.024722039699554443, "learning_rate": 1e-06, "loss": 0.0206, "step": 2294 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3273026645183563, "epoch": 3.6778846153846154, "grad_norm": 0.015624015592038631, "learning_rate": 1e-06, "loss": 0.0245, "step": 2295 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.33001944422721863, "epoch": 3.6794871794871797, "grad_norm": 0.014608138240873814, "learning_rate": 1e-06, "loss": 0.0007, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2430.0, "completions/mean_length": 1686.3046875, "completions/mean_terminated_length": 1541.3570556640625, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "entropy": 0.33564648032188416, "epoch": 3.6810897435897436, "frac_reward_zero_std": 0.59375, "grad_norm": 0.027066614478826523, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 1346826425.0, "reward": 0.43666860461235046, "reward_std": 0.019992386922240257, "rewards/progression_diversity/mean": -0.0030630123801529408, "rewards/progression_diversity/std": 0.031911518424749374, "rewards/symbolic_reward_accuracy/mean": 0.341796875, "rewards/symbolic_reward_accuracy/std": 0.4747757613658905, "rewards/symbolic_reward_partial_score/mean": 0.7746745347976685, "rewards/symbolic_reward_partial_score/std": 0.22795653343200684, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0644441843032837, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 8.418769836425781, "step": 2297 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3229387104511261, "epoch": 3.6826923076923075, "grad_norm": 9548.986328125, "learning_rate": 1e-06, "loss": 0.9497, "step": 2298 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31784258782863617, "epoch": 3.684294871794872, "grad_norm": 0.017409533262252808, "learning_rate": 1e-06, "loss": 0.0463, "step": 2299 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.32778775691986084, "epoch": 3.685897435897436, "grad_norm": 0.01562940888106823, "learning_rate": 1e-06, "loss": -0.0083, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 1658.37109375, "completions/mean_terminated_length": 1542.4212646484375, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "entropy": 0.3309648334980011, "epoch": 3.6875, "frac_reward_zero_std": 0.53125, "grad_norm": 0.03380941227078438, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 1348522775.0, "reward": 0.25678491592407227, "reward_std": 0.02014780044555664, "rewards/progression_diversity/mean": -0.001687450218014419, "rewards/progression_diversity/std": 0.02322215586900711, "rewards/symbolic_reward_accuracy/mean": 0.087890625, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.6808756589889526, "rewards/symbolic_reward_partial_score/std": 0.177845299243927, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0702204704284668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 5.331143379211426, "step": 2301 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.32819345593452454, "epoch": 3.689102564102564, "grad_norm": 297679.28125, "learning_rate": 1e-06, "loss": 8.1288, "step": 2302 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3345276117324829, "epoch": 3.690705128205128, "grad_norm": 0.007232366129755974, "learning_rate": 1e-06, "loss": 0.0004, "step": 2303 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.32529085874557495, "epoch": 3.6923076923076925, "grad_norm": 31849.216796875, "learning_rate": 1e-06, "loss": 4.3377, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2630.0, "completions/mean_length": 1738.244140625, "completions/mean_terminated_length": 1564.5791015625, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "entropy": 0.3245372474193573, "epoch": 3.6939102564102564, "frac_reward_zero_std": 0.5625, "grad_norm": 0.024981984868645668, "learning_rate": 1e-06, "loss": 0.0394, "num_tokens": 1350288916.0, "reward": 0.2519104480743408, "reward_std": 0.015460798516869545, "rewards/progression_diversity/mean": -0.002804999705404043, "rewards/progression_diversity/std": 0.027330690994858742, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.7173991203308105, "rewards/symbolic_reward_partial_score/std": 0.16349852085113525, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06309175491333, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 9.228836059570312, "step": 2305 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.32626084983348846, "epoch": 3.6955128205128203, "grad_norm": 1247.531494140625, "learning_rate": 1e-06, "loss": 0.0508, "step": 2306 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3255147635936737, "epoch": 3.6971153846153846, "grad_norm": 0.009068300016224384, "learning_rate": 1e-06, "loss": 0.5594, "step": 2307 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.32918526232242584, "epoch": 3.698717948717949, "grad_norm": 0.017367210239171982, "learning_rate": 1e-06, "loss": -0.0038, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3004.0, "completions/mean_length": 1774.068359375, "completions/mean_terminated_length": 1600.828125, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "entropy": 0.3189825713634491, "epoch": 3.7003205128205128, "frac_reward_zero_std": 0.4375, "grad_norm": 0.019421013072133064, "learning_rate": 1e-06, "loss": 0.0116, "num_tokens": 1352175175.0, "reward": 0.33472028374671936, "reward_std": 0.026918543502688408, "rewards/progression_diversity/mean": -0.0030723384115844965, "rewards/progression_diversity/std": 0.029028048738837242, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.6978678703308105, "rewards/symbolic_reward_partial_score/std": 0.19654740393161774, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0647794008255005, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 8.113250732421875, "step": 2309 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.32003454864025116, "epoch": 3.7019230769230766, "grad_norm": 0.031067878007888794, "learning_rate": 1e-06, "loss": 0.4607, "step": 2310 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.32604101300239563, "epoch": 3.703525641025641, "grad_norm": 7065.14501953125, "learning_rate": 1e-06, "loss": 0.5045, "step": 2311 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.32465484738349915, "epoch": 3.7051282051282053, "grad_norm": 0.008808803744614124, "learning_rate": 1e-06, "loss": 0.004, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2823.0, "completions/mean_length": 1703.431640625, "completions/mean_terminated_length": 1645.86083984375, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "entropy": 0.33033812046051025, "epoch": 3.706730769230769, "frac_reward_zero_std": 0.65625, "grad_norm": 0.029636235907673836, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 1353921444.0, "reward": 0.3808506727218628, "reward_std": 0.02052554301917553, "rewards/progression_diversity/mean": -0.0008730281260795891, "rewards/progression_diversity/std": 0.01423015259206295, "rewards/symbolic_reward_accuracy/mean": 0.263671875, "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, "rewards/symbolic_reward_partial_score/mean": 0.7421875, "rewards/symbolic_reward_partial_score/std": 0.20971913635730743, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0755610466003418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 2.919158458709717, "step": 2313 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3263317346572876, "epoch": 3.7083333333333335, "grad_norm": 0.009872586466372013, "learning_rate": 1e-06, "loss": 0.049, "step": 2314 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3279000520706177, "epoch": 3.7099358974358974, "grad_norm": 0.025471488013863564, "learning_rate": 1e-06, "loss": 0.0166, "step": 2315 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3354235142469406, "epoch": 3.7115384615384617, "grad_norm": 0.012011319398880005, "learning_rate": 1e-06, "loss": 0.0027, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 2033.783203125, "completions/mean_terminated_length": 1747.92236328125, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "entropy": 0.31582309305667877, "epoch": 3.7131410256410255, "frac_reward_zero_std": 0.46875, "grad_norm": 145.75550842285156, "learning_rate": 1e-06, "loss": 0.0422, "num_tokens": 1355863077.0, "reward": 0.37086308002471924, "reward_std": 0.03677962347865105, "rewards/progression_diversity/mean": -0.004513155668973923, "rewards/progression_diversity/std": 0.03423220291733742, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.7519856691360474, "rewards/symbolic_reward_partial_score/std": 0.19709475338459015, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0543160438537598, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 12.873621940612793, "step": 2317 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3110759109258652, "epoch": 3.71474358974359, "grad_norm": 2783.682861328125, "learning_rate": 1e-06, "loss": 1.9094, "step": 2318 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31409376859664917, "epoch": 3.7163461538461537, "grad_norm": 5945.8662109375, "learning_rate": 1e-06, "loss": 0.319, "step": 2319 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.32960157096385956, "epoch": 3.717948717948718, "grad_norm": 0.012837960384786129, "learning_rate": 1e-06, "loss": -0.0077, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3660.0, "completions/mean_length": 2013.263671875, "completions/mean_terminated_length": 1814.0654296875, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "entropy": 0.32310742139816284, "epoch": 3.719551282051282, "frac_reward_zero_std": 0.5625, "grad_norm": 137.2488250732422, "learning_rate": 1e-06, "loss": 0.0302, "num_tokens": 1357745916.0, "reward": 0.28083521127700806, "reward_std": 0.030796613544225693, "rewards/progression_diversity/mean": -0.002905955072492361, "rewards/progression_diversity/std": 0.02647358365356922, "rewards/symbolic_reward_accuracy/mean": 0.111328125, "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, "rewards/symbolic_reward_partial_score/mean": 0.7168131470680237, "rewards/symbolic_reward_partial_score/std": 0.1985962688922882, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0641801357269287, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 7.660578727722168, "step": 2321 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.31349869072437286, "epoch": 3.7211538461538463, "grad_norm": 15555.6708984375, "learning_rate": 1e-06, "loss": 6.2007, "step": 2322 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3223974406719208, "epoch": 3.72275641025641, "grad_norm": 968024.4375, "learning_rate": 1e-06, "loss": 23.6673, "step": 2323 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.31636857986450195, "epoch": 3.7243589743589745, "grad_norm": 0.015157344751060009, "learning_rate": 1e-06, "loss": 0.0284, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 2236.482421875, "completions/mean_terminated_length": 1838.760986328125, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "entropy": 0.3094794303178787, "epoch": 3.7259615384615383, "frac_reward_zero_std": 0.34375, "grad_norm": 0.025273853912949562, "learning_rate": 1e-06, "loss": 0.0773, "num_tokens": 1359728803.0, "reward": 0.36245208978652954, "reward_std": 0.04594516381621361, "rewards/progression_diversity/mean": -0.005767707247287035, "rewards/progression_diversity/std": 0.036357343196868896, "rewards/symbolic_reward_accuracy/mean": 0.2421875, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.7291991710662842, "rewards/symbolic_reward_partial_score/std": 0.22089123725891113, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.050485372543335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 14.873160362243652, "step": 2325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.31531573832035065, "epoch": 3.7275641025641026, "grad_norm": 0.019093314185738564, "learning_rate": 1e-06, "loss": 0.0435, "step": 2326 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3113931119441986, "epoch": 3.7291666666666665, "grad_norm": 0.015845902264118195, "learning_rate": 1e-06, "loss": 0.0502, "step": 2327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3193650245666504, "epoch": 3.730769230769231, "grad_norm": 0.1452263742685318, "learning_rate": 1e-06, "loss": 0.0405, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3291.0, "completions/mean_length": 2277.3984375, "completions/mean_terminated_length": 1880.8272705078125, "completions/min_length": 1066.0, "completions/min_terminated_length": 1066.0, "entropy": 0.32257337868213654, "epoch": 3.7323717948717947, "frac_reward_zero_std": 0.375, "grad_norm": 67.19734954833984, "learning_rate": 1e-06, "loss": -0.0058, "num_tokens": 1361753231.0, "reward": 0.4234715700149536, "reward_std": 0.052917372435331345, "rewards/progression_diversity/mean": -0.00587063655257225, "rewards/progression_diversity/std": 0.03647465631365776, "rewards/symbolic_reward_accuracy/mean": 0.328125, "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, "rewards/symbolic_reward_partial_score/mean": 0.7568196654319763, "rewards/symbolic_reward_partial_score/std": 0.2349434494972229, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.053420066833496, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 13.831912994384766, "step": 2329 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3165089935064316, "epoch": 3.733974358974359, "grad_norm": 1270778.375, "learning_rate": 1e-06, "loss": 109.9686, "step": 2330 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31606537103652954, "epoch": 3.7355769230769234, "grad_norm": 241028.328125, "learning_rate": 1e-06, "loss": 29.3297, "step": 2331 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31982485949993134, "epoch": 3.7371794871794872, "grad_norm": 5490.81640625, "learning_rate": 1e-06, "loss": 1.0961, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 2758.09375, "completions/mean_terminated_length": 1999.5382080078125, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.3020293414592743, "epoch": 3.738782051282051, "frac_reward_zero_std": 0.28125, "grad_norm": 2158.609375, "learning_rate": 1e-06, "loss": 0.0567, "num_tokens": 1364147151.0, "reward": 0.28579181432724, "reward_std": 0.057038046419620514, "rewards/progression_diversity/mean": -0.01163883414119482, "rewards/progression_diversity/std": 0.052136827260255814, "rewards/symbolic_reward_accuracy/mean": 0.140625, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.6769856810569763, "rewards/symbolic_reward_partial_score/std": 0.21585947275161743, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0365989208221436, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 21.644445419311523, "step": 2333 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.29430215060710907, "epoch": 3.7403846153846154, "grad_norm": 2930266.0, "learning_rate": 1e-06, "loss": 34.3929, "step": 2334 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3125629723072052, "epoch": 3.7419871794871797, "grad_norm": 533435.4375, "learning_rate": 1e-06, "loss": 14.3577, "step": 2335 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30381515622138977, "epoch": 3.7435897435897436, "grad_norm": 0.9284766912460327, "learning_rate": 1e-06, "loss": 0.0779, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3698.0, "completions/mean_length": 2933.576171875, "completions/mean_terminated_length": 1976.8514404296875, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "entropy": 0.28462617099285126, "epoch": 3.7451923076923075, "frac_reward_zero_std": 0.15625, "grad_norm": 4198.36865234375, "learning_rate": 1e-06, "loss": 0.1291, "num_tokens": 1366628678.0, "reward": 0.24286873638629913, "reward_std": 0.05714884027838707, "rewards/progression_diversity/mean": -0.014884944073855877, "rewards/progression_diversity/std": 0.05795244127511978, "rewards/symbolic_reward_accuracy/mean": 0.080078125, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.6570637822151184, "rewards/symbolic_reward_partial_score/std": 0.2119654417037964, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0245647430419922, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 27.924781799316406, "step": 2337 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3178318291902542, "epoch": 3.746794871794872, "grad_norm": 0.02091590128839016, "learning_rate": 1e-06, "loss": 0.039, "step": 2338 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.29258735477924347, "epoch": 3.748397435897436, "grad_norm": 0.03195264935493469, "learning_rate": 1e-06, "loss": 0.0798, "step": 2339 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3096400648355484, "epoch": 3.75, "grad_norm": 0.027371717616915703, "learning_rate": 1e-06, "loss": 0.0889, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 2866.087890625, "completions/mean_terminated_length": 1994.87109375, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "entropy": 0.3162682503461838, "epoch": 3.751602564102564, "frac_reward_zero_std": 0.125, "grad_norm": 2558.345458984375, "learning_rate": 1e-06, "loss": 0.0613, "num_tokens": 1368975267.0, "reward": 0.2645840644836426, "reward_std": 0.05693788453936577, "rewards/progression_diversity/mean": -0.013275885954499245, "rewards/progression_diversity/std": 0.05504615232348442, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.6545247435569763, "rewards/symbolic_reward_partial_score/std": 0.22332006692886353, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.033745288848877, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 23.267847061157227, "step": 2341 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3221198171377182, "epoch": 3.753205128205128, "grad_norm": 0.016970986500382423, "learning_rate": 1e-06, "loss": 0.0128, "step": 2342 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2914904057979584, "epoch": 3.7548076923076925, "grad_norm": 0.016527332365512848, "learning_rate": 1e-06, "loss": 0.1016, "step": 2343 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.29611217975616455, "epoch": 3.7564102564102564, "grad_norm": 0.015034251846373081, "learning_rate": 1e-06, "loss": 0.1444, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3699.0, "completions/mean_length": 2942.474609375, "completions/mean_terminated_length": 1956.1990966796875, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "entropy": 0.324889600276947, "epoch": 3.7580128205128203, "frac_reward_zero_std": 0.21875, "grad_norm": 564.7570190429688, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 1371389206.0, "reward": 0.4041377902030945, "reward_std": 0.0859639048576355, "rewards/progression_diversity/mean": -0.01590690203011036, "rewards/progression_diversity/std": 0.06111828610301018, "rewards/symbolic_reward_accuracy/mean": 0.314453125, "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, "rewards/symbolic_reward_partial_score/mean": 0.7259114384651184, "rewards/symbolic_reward_partial_score/std": 0.2665213644504547, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0233384370803833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 29.020061492919922, "step": 2345 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2806452810764313, "epoch": 3.7596153846153846, "grad_norm": 0.017962336540222168, "learning_rate": 1e-06, "loss": 0.1727, "step": 2346 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3103862851858139, "epoch": 3.761217948717949, "grad_norm": 0.013579954393208027, "learning_rate": 1e-06, "loss": 0.0811, "step": 2347 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.305005207657814, "epoch": 3.7628205128205128, "grad_norm": 0.01808071881532669, "learning_rate": 1e-06, "loss": 0.1148, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3296.0, "completions/mean_length": 2663.65234375, "completions/mean_terminated_length": 1959.322509765625, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "entropy": 0.33493170142173767, "epoch": 3.7644230769230766, "frac_reward_zero_std": 0.28125, "grad_norm": 88.44670867919922, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 1373608292.0, "reward": 0.3504989743232727, "reward_std": 0.0687357485294342, "rewards/progression_diversity/mean": -0.013580359518527985, "rewards/progression_diversity/std": 0.062451787292957306, "rewards/symbolic_reward_accuracy/mean": 0.22265625, "rewards/symbolic_reward_accuracy/std": 0.41643625497817993, "rewards/symbolic_reward_partial_score/mean": 0.7293294668197632, "rewards/symbolic_reward_partial_score/std": 0.23006510734558105, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0389269590377808, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 22.056781768798828, "step": 2349 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3003799021244049, "epoch": 3.766025641025641, "grad_norm": 0.014394021593034267, "learning_rate": 1e-06, "loss": 0.1444, "step": 2350 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3186776787042618, "epoch": 3.7676282051282053, "grad_norm": 0.02413056790828705, "learning_rate": 1e-06, "loss": 0.0553, "step": 2351 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.32591530680656433, "epoch": 3.769230769230769, "grad_norm": 0.01364830881357193, "learning_rate": 1e-06, "loss": 0.0332, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3292.0, "completions/mean_length": 3137.02734375, "completions/mean_terminated_length": 1953.2552490234375, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "entropy": 0.30336521565914154, "epoch": 3.7708333333333335, "frac_reward_zero_std": 0.21875, "grad_norm": 689.3433227539062, "learning_rate": 1e-06, "loss": 0.0575, "num_tokens": 1376180162.0, "reward": 0.30668240785598755, "reward_std": 0.07010701298713684, "rewards/progression_diversity/mean": -0.021703477948904037, "rewards/progression_diversity/std": 0.07450754940509796, "rewards/symbolic_reward_accuracy/mean": 0.171875, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.6844563484191895, "rewards/symbolic_reward_partial_score/std": 0.23659244179725647, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0194364786148071, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 32.71318054199219, "step": 2353 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3008395582437515, "epoch": 3.7724358974358974, "grad_norm": 0.02489347755908966, "learning_rate": 1e-06, "loss": 0.0919, "step": 2354 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3014819622039795, "epoch": 3.7740384615384617, "grad_norm": 0.018208302557468414, "learning_rate": 1e-06, "loss": 0.1138, "step": 2355 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30451202392578125, "epoch": 3.7756410256410255, "grad_norm": 0.03824426978826523, "learning_rate": 1e-06, "loss": 0.049, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3241.0, "completions/mean_length": 2671.005859375, "completions/mean_terminated_length": 1937.3887939453125, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "entropy": 0.32287219166755676, "epoch": 3.77724358974359, "frac_reward_zero_std": 0.34375, "grad_norm": 901.6248779296875, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 1378516597.0, "reward": 0.2821524143218994, "reward_std": 0.06520838290452957, "rewards/progression_diversity/mean": -0.015226154588162899, "rewards/progression_diversity/std": 0.06770697236061096, "rewards/symbolic_reward_accuracy/mean": 0.126953125, "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, "rewards/symbolic_reward_partial_score/mean": 0.6903645992279053, "rewards/symbolic_reward_partial_score/std": 0.22543098032474518, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0395439863204956, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 22.674461364746094, "step": 2357 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30759990215301514, "epoch": 3.7788461538461537, "grad_norm": 0.5585769414901733, "learning_rate": 1e-06, "loss": 0.0984, "step": 2358 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3166644871234894, "epoch": 3.780448717948718, "grad_norm": 0.1928190290927887, "learning_rate": 1e-06, "loss": 0.0079, "step": 2359 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3207753002643585, "epoch": 3.782051282051282, "grad_norm": 0.02237272635102272, "learning_rate": 1e-06, "loss": 0.0525, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3211.0, "completions/mean_length": 2510.060546875, "completions/mean_terminated_length": 1887.14892578125, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "entropy": 0.3191475421190262, "epoch": 3.7836538461538463, "frac_reward_zero_std": 0.21875, "grad_norm": 950.1478271484375, "learning_rate": 1e-06, "loss": 0.0732, "num_tokens": 1380652708.0, "reward": 0.36586660146713257, "reward_std": 0.09463842213153839, "rewards/progression_diversity/mean": -0.011485631577670574, "rewards/progression_diversity/std": 0.05633028969168663, "rewards/symbolic_reward_accuracy/mean": 0.2421875, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.7420735359191895, "rewards/symbolic_reward_partial_score/std": 0.243971049785614, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038116455078125, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 23.570289611816406, "step": 2361 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.33112041652202606, "epoch": 3.78525641025641, "grad_norm": 14.519754409790039, "learning_rate": 1e-06, "loss": 0.061, "step": 2362 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.33028391003608704, "epoch": 3.7868589743589745, "grad_norm": 0.024247992783784866, "learning_rate": 1e-06, "loss": 0.0352, "step": 2363 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3137480616569519, "epoch": 3.7884615384615383, "grad_norm": 0.031128326430916786, "learning_rate": 1e-06, "loss": 0.0846, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2966.0, "completions/mean_length": 2746.02734375, "completions/mean_terminated_length": 1867.07275390625, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "entropy": 0.30356088280677795, "epoch": 3.7900641025641026, "frac_reward_zero_std": 0.1875, "grad_norm": 1132.28857421875, "learning_rate": 1e-06, "loss": 0.1076, "num_tokens": 1382941074.0, "reward": 0.30738329887390137, "reward_std": 0.057456813752651215, "rewards/progression_diversity/mean": -0.017531972378492355, "rewards/progression_diversity/std": 0.07124343514442444, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.6723307371139526, "rewards/symbolic_reward_partial_score/std": 0.23962588608264923, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0223664045333862, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 30.863096237182617, "step": 2365 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.306650310754776, "epoch": 3.7916666666666665, "grad_norm": 0.026560494676232338, "learning_rate": 1e-06, "loss": 0.1009, "step": 2366 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.313573881983757, "epoch": 3.793269230769231, "grad_norm": 0.01731313206255436, "learning_rate": 1e-06, "loss": 0.1042, "step": 2367 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.32431718707084656, "epoch": 3.7948717948717947, "grad_norm": 0.012698481790721416, "learning_rate": 1e-06, "loss": 0.0471, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3138.0, "completions/mean_length": 2853.966796875, "completions/mean_terminated_length": 1891.5794677734375, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "entropy": 0.31773290038108826, "epoch": 3.796474358974359, "frac_reward_zero_std": 0.15625, "grad_norm": 901.328369140625, "learning_rate": 1e-06, "loss": 0.0236, "num_tokens": 1385256705.0, "reward": 0.2686350345611572, "reward_std": 0.06755457073450089, "rewards/progression_diversity/mean": -0.018334418535232544, "rewards/progression_diversity/std": 0.07072144001722336, "rewards/symbolic_reward_accuracy/mean": 0.09765625, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.7098632454872131, "rewards/symbolic_reward_partial_score/std": 0.21099787950515747, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.017049789428711, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 34.733001708984375, "step": 2369 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.33099713921546936, "epoch": 3.7980769230769234, "grad_norm": 0.01458797324448824, "learning_rate": 1e-06, "loss": 0.0647, "step": 2370 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2994411885738373, "epoch": 3.7996794871794872, "grad_norm": 0.03737116977572441, "learning_rate": 1e-06, "loss": 0.1793, "step": 2371 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.32057875394821167, "epoch": 3.801282051282051, "grad_norm": 0.011851564049720764, "learning_rate": 1e-06, "loss": 0.0763, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3275.0, "completions/mean_length": 2494.095703125, "completions/mean_terminated_length": 1870.46728515625, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "entropy": 0.321204349398613, "epoch": 3.8028846153846154, "frac_reward_zero_std": 0.40625, "grad_norm": 464.8273620605469, "learning_rate": 1e-06, "loss": 0.0536, "num_tokens": 1387418002.0, "reward": 0.38406121730804443, "reward_std": 0.06720907241106033, "rewards/progression_diversity/mean": -0.011848336085677147, "rewards/progression_diversity/std": 0.05758029222488403, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.7552083730697632, "rewards/symbolic_reward_partial_score/std": 0.21863536536693573, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.042336344718933, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 22.52627944946289, "step": 2373 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.33628255128860474, "epoch": 3.8044871794871797, "grad_norm": 16.08426856994629, "learning_rate": 1e-06, "loss": 0.047, "step": 2374 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3366074562072754, "epoch": 3.8060897435897436, "grad_norm": 4.6837029457092285, "learning_rate": 1e-06, "loss": 0.0627, "step": 2375 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.32915179431438446, "epoch": 3.8076923076923075, "grad_norm": 0.2614407241344452, "learning_rate": 1e-06, "loss": 0.0403, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3255.0, "completions/mean_length": 2107.373046875, "completions/mean_terminated_length": 1880.760009765625, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "entropy": 0.33755332231521606, "epoch": 3.809294871794872, "frac_reward_zero_std": 0.46875, "grad_norm": 317.5009460449219, "learning_rate": 1e-06, "loss": 0.0604, "num_tokens": 1389344513.0, "reward": 0.2682853937149048, "reward_std": 0.022097572684288025, "rewards/progression_diversity/mean": -0.003493726020678878, "rewards/progression_diversity/std": 0.02860984578728676, "rewards/symbolic_reward_accuracy/mean": 0.091796875, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.712109386920929, "rewards/symbolic_reward_partial_score/std": 0.19135794043540955, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.067301869392395, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 10.378652572631836, "step": 2377 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3546573221683502, "epoch": 3.810897435897436, "grad_norm": 0.024478528648614883, "learning_rate": 1e-06, "loss": 0.2443, "step": 2378 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.36258837580680847, "epoch": 3.8125, "grad_norm": 0.020279182121157646, "learning_rate": 1e-06, "loss": 0.0001, "step": 2379 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3494533598423004, "epoch": 3.814102564102564, "grad_norm": 0.021380124613642693, "learning_rate": 1e-06, "loss": 0.0156, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3135.0, "completions/mean_length": 2129.833984375, "completions/mean_terminated_length": 1874.7891845703125, "completions/min_length": 1129.0, "completions/min_terminated_length": 1129.0, "entropy": 0.33585888147354126, "epoch": 3.815705128205128, "frac_reward_zero_std": 0.5625, "grad_norm": 1741.7381591796875, "learning_rate": 1e-06, "loss": 0.0675, "num_tokens": 1391386876.0, "reward": 0.23786211013793945, "reward_std": 0.01681671291589737, "rewards/progression_diversity/mean": -0.0043185316026210785, "rewards/progression_diversity/std": 0.03434579074382782, "rewards/symbolic_reward_accuracy/mean": 0.0625, "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, "rewards/symbolic_reward_partial_score/mean": 0.6706217527389526, "rewards/symbolic_reward_partial_score/std": 0.17827239632606506, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0659494400024414, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 11.326656341552734, "step": 2381 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.35594816505908966, "epoch": 3.8173076923076925, "grad_norm": 0.012054548598825932, "learning_rate": 1e-06, "loss": 0.001, "step": 2382 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3579705059528351, "epoch": 3.8189102564102564, "grad_norm": 0.025561781600117683, "learning_rate": 1e-06, "loss": 0.0028, "step": 2383 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.33812710642814636, "epoch": 3.8205128205128203, "grad_norm": 0.027250172570347786, "learning_rate": 1e-06, "loss": 0.0297, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3329.0, "completions/mean_length": 2672.564453125, "completions/mean_terminated_length": 1819.1556396484375, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "entropy": 0.3372623771429062, "epoch": 3.8221153846153846, "frac_reward_zero_std": 0.34375, "grad_norm": 106.94781494140625, "learning_rate": 1e-06, "loss": -0.0134, "num_tokens": 1393736781.0, "reward": 0.338218629360199, "reward_std": 0.033895812928676605, "rewards/progression_diversity/mean": -0.015542363747954369, "rewards/progression_diversity/std": 0.06333222985267639, "rewards/symbolic_reward_accuracy/mean": 0.21484375, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.7027831673622131, "rewards/symbolic_reward_partial_score/std": 0.22551730275154114, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0194793939590454, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 35.487525939941406, "step": 2385 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3197958320379257, "epoch": 3.823717948717949, "grad_norm": 3424.5888671875, "learning_rate": 1e-06, "loss": 0.4028, "step": 2386 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3210483342409134, "epoch": 3.8253205128205128, "grad_norm": 0.28248870372772217, "learning_rate": 1e-06, "loss": 0.031, "step": 2387 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.31268228590488434, "epoch": 3.8269230769230766, "grad_norm": 0.01564093679189682, "learning_rate": 1e-06, "loss": 0.1354, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 2821.177734375, "completions/mean_terminated_length": 1764.7052001953125, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "entropy": 0.3302348703145981, "epoch": 3.828525641025641, "frac_reward_zero_std": 0.21875, "grad_norm": 1121.2408447265625, "learning_rate": 1e-06, "loss": 0.0995, "num_tokens": 1395997880.0, "reward": 0.3832484483718872, "reward_std": 0.039947956800460815, "rewards/progression_diversity/mean": -0.017442844808101654, "rewards/progression_diversity/std": 0.06394115090370178, "rewards/symbolic_reward_accuracy/mean": 0.2734375, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.733154296875, "rewards/symbolic_reward_partial_score/std": 0.22632841765880585, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0048656463623047, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 43.669517517089844, "step": 2389 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2924426198005676, "epoch": 3.8301282051282053, "grad_norm": 1060.7529296875, "learning_rate": 1e-06, "loss": 0.3549, "step": 2390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.34059783816337585, "epoch": 3.831730769230769, "grad_norm": 1.79716956615448, "learning_rate": 1e-06, "loss": 0.0341, "step": 2391 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3371441662311554, "epoch": 3.8333333333333335, "grad_norm": 15.651530265808105, "learning_rate": 1e-06, "loss": 0.0659, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2952.0, "completions/mean_length": 2100.259765625, "completions/mean_terminated_length": 1786.6446533203125, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "entropy": 0.3499729782342911, "epoch": 3.8349358974358974, "frac_reward_zero_std": 0.5625, "grad_norm": 0.016351254656910896, "learning_rate": 1e-06, "loss": 0.0287, "num_tokens": 1397980253.0, "reward": 0.4495462477207184, "reward_std": 0.04341159760951996, "rewards/progression_diversity/mean": -0.006314532831311226, "rewards/progression_diversity/std": 0.04331482574343681, "rewards/symbolic_reward_accuracy/mean": 0.361328125, "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, "rewards/symbolic_reward_partial_score/mean": 0.779296875, "rewards/symbolic_reward_partial_score/std": 0.22735320031642914, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0571317672729492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 16.10877227783203, "step": 2393 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3411282151937485, "epoch": 3.8365384615384617, "grad_norm": 8039.83837890625, "learning_rate": 1e-06, "loss": 0.8501, "step": 2394 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.356269970536232, "epoch": 3.8381410256410255, "grad_norm": 289.86322021484375, "learning_rate": 1e-06, "loss": 0.0269, "step": 2395 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3452070653438568, "epoch": 3.83974358974359, "grad_norm": 0.014756686054170132, "learning_rate": 1e-06, "loss": 0.0689, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4212.0, "completions/mean_length": 2003.048828125, "completions/mean_terminated_length": 1774.7799072265625, "completions/min_length": 998.0, "completions/min_terminated_length": 998.0, "entropy": 0.3432788699865341, "epoch": 3.8413461538461537, "frac_reward_zero_std": 0.4375, "grad_norm": 1194.415771484375, "learning_rate": 1e-06, "loss": 0.0683, "num_tokens": 1399852966.0, "reward": 0.3248797655105591, "reward_std": 0.023387808352708817, "rewards/progression_diversity/mean": -0.004212523810565472, "rewards/progression_diversity/std": 0.034703828394412994, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.7139322757720947, "rewards/symbolic_reward_partial_score/std": 0.2121579349040985, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0637973546981812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 12.788324356079102, "step": 2397 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.36405032873153687, "epoch": 3.842948717948718, "grad_norm": 0.022353997454047203, "learning_rate": 1e-06, "loss": 0.0133, "step": 2398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.35727696120738983, "epoch": 3.844551282051282, "grad_norm": 1.3744477033615112, "learning_rate": 1e-06, "loss": -0.002, "step": 2399 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.35195624828338623, "epoch": 3.8461538461538463, "grad_norm": 0.02140852063894272, "learning_rate": 1e-06, "loss": 0.0567, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 1852.724609375, "completions/mean_terminated_length": 1767.07861328125, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "entropy": 0.36357755959033966, "epoch": 3.84775641025641, "frac_reward_zero_std": 0.65625, "grad_norm": 0.02421417459845543, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 1401668985.0, "reward": 0.31430160999298096, "reward_std": 0.01744687370955944, "rewards/progression_diversity/mean": -0.001969876466318965, "rewards/progression_diversity/std": 0.026346473023295403, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.739794909954071, "rewards/symbolic_reward_partial_score/std": 0.19770273566246033, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0794093608856201, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 5.0978240966796875, "step": 2401 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.35801486670970917, "epoch": 3.8493589743589745, "grad_norm": 0.02062184363603592, "learning_rate": 1e-06, "loss": 36.5217, "step": 2402 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.36254678666591644, "epoch": 3.8509615384615383, "grad_norm": 0.024333849549293518, "learning_rate": 1e-06, "loss": 216.8933, "step": 2403 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3597784638404846, "epoch": 3.8525641025641026, "grad_norm": 0.016081588342785835, "learning_rate": 1e-06, "loss": 2.3758, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2842.0, "completions/mean_length": 1912.380859375, "completions/mean_terminated_length": 1798.43115234375, "completions/min_length": 1112.0, "completions/min_terminated_length": 1112.0, "entropy": 0.3635348379611969, "epoch": 3.8541666666666665, "frac_reward_zero_std": 0.53125, "grad_norm": 754.1912231445312, "learning_rate": 1e-06, "loss": 0.0195, "num_tokens": 1403545500.0, "reward": 0.31084516644477844, "reward_std": 0.025590339675545692, "rewards/progression_diversity/mean": -0.002397148869931698, "rewards/progression_diversity/std": 0.027250634506344795, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.6853190660476685, "rewards/symbolic_reward_partial_score/std": 0.19453242421150208, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0779601335525513, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 6.665762901306152, "step": 2405 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3639901280403137, "epoch": 3.855769230769231, "grad_norm": 0.02506587654352188, "learning_rate": 1e-06, "loss": 0.0106, "step": 2406 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.36371636390686035, "epoch": 3.8573717948717947, "grad_norm": 0.019767044112086296, "learning_rate": 1e-06, "loss": 0.0172, "step": 2407 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.37076058983802795, "epoch": 3.858974358974359, "grad_norm": 0.021978937089443207, "learning_rate": 1e-06, "loss": -0.0016, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 1938.568359375, "completions/mean_terminated_length": 1796.1085205078125, "completions/min_length": 1101.0, "completions/min_terminated_length": 1101.0, "entropy": 0.3613286018371582, "epoch": 3.8605769230769234, "frac_reward_zero_std": 0.5625, "grad_norm": 0.014481249265372753, "learning_rate": 1e-06, "loss": 0.0153, "num_tokens": 1405497455.0, "reward": 0.3004390001296997, "reward_std": 0.0335419736802578, "rewards/progression_diversity/mean": -0.0029758564196527004, "rewards/progression_diversity/std": 0.031783487647771835, "rewards/symbolic_reward_accuracy/mean": 0.14453125, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.7151041626930237, "rewards/symbolic_reward_partial_score/std": 0.19497309625148773, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074684500694275, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 8.23452377319336, "step": 2409 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3613293021917343, "epoch": 3.8621794871794872, "grad_norm": 0.0199386365711689, "learning_rate": 1e-06, "loss": 0.0316, "step": 2410 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3616602122783661, "epoch": 3.863782051282051, "grad_norm": 0.5316094756126404, "learning_rate": 1e-06, "loss": 0.0207, "step": 2411 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.36158451437950134, "epoch": 3.8653846153846154, "grad_norm": 0.012657379731535912, "learning_rate": 1e-06, "loss": 0.0239, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3447.0, "completions/mean_length": 1951.318359375, "completions/mean_terminated_length": 1837.6751708984375, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "entropy": 0.3670198619365692, "epoch": 3.8669871794871797, "frac_reward_zero_std": 0.40625, "grad_norm": 0.029966576024889946, "learning_rate": 1e-06, "loss": 0.0243, "num_tokens": 1407303266.0, "reward": 0.29933494329452515, "reward_std": 0.02662837505340576, "rewards/progression_diversity/mean": -0.001564997830428183, "rewards/progression_diversity/std": 0.01767376810312271, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.693798840045929, "rewards/symbolic_reward_partial_score/std": 0.1757093071937561, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079904317855835, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 6.317461967468262, "step": 2413 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.380515456199646, "epoch": 3.8685897435897436, "grad_norm": 0.014633157290518284, "learning_rate": 1e-06, "loss": -0.0096, "step": 2414 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.36171650886535645, "epoch": 3.8701923076923075, "grad_norm": 0.019229549914598465, "learning_rate": 1e-06, "loss": 0.0416, "step": 2415 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3781423568725586, "epoch": 3.871794871794872, "grad_norm": 0.031127827242016792, "learning_rate": 1e-06, "loss": 0.0018, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 2027.650390625, "completions/mean_terminated_length": 1914.6082763671875, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.37264350056648254, "epoch": 3.873397435897436, "frac_reward_zero_std": 0.4375, "grad_norm": 181.88681030273438, "learning_rate": 1e-06, "loss": 0.0241, "num_tokens": 1409189023.0, "reward": 0.33668702840805054, "reward_std": 0.035758331418037415, "rewards/progression_diversity/mean": -0.0017103657592087984, "rewards/progression_diversity/std": 0.020190343260765076, "rewards/symbolic_reward_accuracy/mean": 0.20703125, "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, "rewards/symbolic_reward_partial_score/mean": 0.7089356184005737, "rewards/symbolic_reward_partial_score/std": 0.2059522420167923, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079702377319336, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 5.716109275817871, "step": 2417 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3729843199253082, "epoch": 3.875, "grad_norm": 0.0172417052090168, "learning_rate": 1e-06, "loss": -0.0047, "step": 2418 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3554026186466217, "epoch": 3.876602564102564, "grad_norm": 0.014652607962489128, "learning_rate": 1e-06, "loss": 0.0365, "step": 2419 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.36666855216026306, "epoch": 3.878205128205128, "grad_norm": 0.025280388072133064, "learning_rate": 1e-06, "loss": 0.0004, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3165.0, "completions/mean_length": 2169.5234375, "completions/mean_terminated_length": 1972.4912109375, "completions/min_length": 1212.0, "completions/min_terminated_length": 1212.0, "entropy": 0.35489410161972046, "epoch": 3.8798076923076925, "frac_reward_zero_std": 0.5, "grad_norm": 993.462158203125, "learning_rate": 1e-06, "loss": 0.0898, "num_tokens": 1411120619.0, "reward": 0.3209129571914673, "reward_std": 0.036054469645023346, "rewards/progression_diversity/mean": -0.0024563795886933804, "rewards/progression_diversity/std": 0.022509947419166565, "rewards/symbolic_reward_accuracy/mean": 0.16796875, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.7371094226837158, "rewards/symbolic_reward_partial_score/std": 0.17446953058242798, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0719996690750122, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 9.719746589660645, "step": 2421 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.372598797082901, "epoch": 3.8814102564102564, "grad_norm": 0.00928138755261898, "learning_rate": 1e-06, "loss": -0.0118, "step": 2422 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3691015988588333, "epoch": 3.8830128205128203, "grad_norm": 0.018029429018497467, "learning_rate": 1e-06, "loss": 0.0338, "step": 2423 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.36332499980926514, "epoch": 3.8846153846153846, "grad_norm": 0.013046249747276306, "learning_rate": 1e-06, "loss": 0.0302, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3514.0, "completions/mean_length": 2295.619140625, "completions/mean_terminated_length": 1957.498046875, "completions/min_length": 1022.0, "completions/min_terminated_length": 1022.0, "entropy": 0.3619091063737869, "epoch": 3.886217948717949, "frac_reward_zero_std": 0.3125, "grad_norm": 1141.9002685546875, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 1413051752.0, "reward": 0.3810175061225891, "reward_std": 0.04954010993242264, "rewards/progression_diversity/mean": -0.00469579640775919, "rewards/progression_diversity/std": 0.03242946416139603, "rewards/symbolic_reward_accuracy/mean": 0.2421875, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.789746105670929, "rewards/symbolic_reward_partial_score/std": 0.18808871507644653, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0633162260055542, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 14.651718139648438, "step": 2425 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.36615803837776184, "epoch": 3.8878205128205128, "grad_norm": 833.3787231445312, "learning_rate": 1e-06, "loss": 0.0552, "step": 2426 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.37386661767959595, "epoch": 3.8894230769230766, "grad_norm": 0.020444680005311966, "learning_rate": 1e-06, "loss": 0.0178, "step": 2427 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3629365861415863, "epoch": 3.891025641025641, "grad_norm": 0.02197657711803913, "learning_rate": 1e-06, "loss": 0.0594, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3316.0, "completions/mean_length": 2237.09375, "completions/mean_terminated_length": 1983.9681396484375, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "entropy": 0.36241379380226135, "epoch": 3.8926282051282053, "frac_reward_zero_std": 0.53125, "grad_norm": 351.0287780761719, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 1415054008.0, "reward": 0.33530277013778687, "reward_std": 0.013850128278136253, "rewards/progression_diversity/mean": -0.0029261300805956125, "rewards/progression_diversity/std": 0.02486591413617134, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.7447265386581421, "rewards/symbolic_reward_partial_score/std": 0.18490070104599, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0711203813552856, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 10.22694206237793, "step": 2429 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3631092756986618, "epoch": 3.894230769230769, "grad_norm": 0.015336292795836926, "learning_rate": 1e-06, "loss": 0.0474, "step": 2430 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3701094090938568, "epoch": 3.8958333333333335, "grad_norm": 0.019072409719228745, "learning_rate": 1e-06, "loss": -0.0113, "step": 2431 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.36329878866672516, "epoch": 3.8974358974358974, "grad_norm": 0.009201987646520138, "learning_rate": 1e-06, "loss": 0.0567, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 2161.703125, "completions/mean_terminated_length": 1964.5623779296875, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "entropy": 0.36919447779655457, "epoch": 3.8990384615384617, "frac_reward_zero_std": 0.46875, "grad_norm": 651.1842041015625, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 1417071456.0, "reward": 0.26072680950164795, "reward_std": 0.03266744688153267, "rewards/progression_diversity/mean": -0.0025163046084344387, "rewards/progression_diversity/std": 0.024175414815545082, "rewards/symbolic_reward_accuracy/mean": 0.0859375, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.6992512941360474, "rewards/symbolic_reward_partial_score/std": 0.1867949366569519, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0748156309127808, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 8.038581848144531, "step": 2433 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3648574501276016, "epoch": 3.9006410256410255, "grad_norm": 0.026968425139784813, "learning_rate": 1e-06, "loss": 0.0404, "step": 2434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3710328936576843, "epoch": 3.90224358974359, "grad_norm": 0.022169405594468117, "learning_rate": 1e-06, "loss": 0.0208, "step": 2435 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3560585528612137, "epoch": 3.9038461538461537, "grad_norm": 0.012255042791366577, "learning_rate": 1e-06, "loss": 0.0366, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3374.0, "completions/mean_length": 2268.74609375, "completions/mean_terminated_length": 1987.5657958984375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "entropy": 0.36314675211906433, "epoch": 3.905448717948718, "frac_reward_zero_std": 0.59375, "grad_norm": 26.744871139526367, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 1419090318.0, "reward": 0.41112661361694336, "reward_std": 0.022687647491693497, "rewards/progression_diversity/mean": -0.0030626305378973484, "rewards/progression_diversity/std": 0.022572841495275497, "rewards/symbolic_reward_accuracy/mean": 0.306640625, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.7591959238052368, "rewards/symbolic_reward_partial_score/std": 0.19385521113872528, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0674618482589722, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 11.076675415039062, "step": 2437 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3628970682621002, "epoch": 3.907051282051282, "grad_norm": 0.018419716507196426, "learning_rate": 1e-06, "loss": 0.0137, "step": 2438 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.34759269654750824, "epoch": 3.9086538461538463, "grad_norm": 0.008998063392937183, "learning_rate": 1e-06, "loss": 0.0594, "step": 2439 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3605688661336899, "epoch": 3.91025641025641, "grad_norm": 0.010430578142404556, "learning_rate": 1e-06, "loss": 0.0363, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3622.0, "completions/mean_length": 2239.3984375, "completions/mean_terminated_length": 2014.881103515625, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "entropy": 0.3595893830060959, "epoch": 3.9118589743589745, "frac_reward_zero_std": 0.5625, "grad_norm": 0.020290104672312737, "learning_rate": 1e-06, "loss": 0.0345, "num_tokens": 1421054490.0, "reward": 0.3346139192581177, "reward_std": 0.03494077920913696, "rewards/progression_diversity/mean": -0.002476356690749526, "rewards/progression_diversity/std": 0.022448500618338585, "rewards/symbolic_reward_accuracy/mean": 0.193359375, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.7313476800918579, "rewards/symbolic_reward_partial_score/std": 0.19797399640083313, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0738534927368164, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 8.171432495117188, "step": 2441 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3624930679798126, "epoch": 3.9134615384615383, "grad_norm": 0.01627454161643982, "learning_rate": 1e-06, "loss": 0.0318, "step": 2442 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3630795478820801, "epoch": 3.9150641025641026, "grad_norm": 0.011845182627439499, "learning_rate": 1e-06, "loss": 0.0527, "step": 2443 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3635891079902649, "epoch": 3.9166666666666665, "grad_norm": 0.012927143834531307, "learning_rate": 1e-06, "loss": 0.0315, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3619.0, "completions/mean_length": 2434.078125, "completions/mean_terminated_length": 2041.91162109375, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "entropy": 0.3447739779949188, "epoch": 3.918269230769231, "frac_reward_zero_std": 0.40625, "grad_norm": 580.8510131835938, "learning_rate": 1e-06, "loss": 0.0723, "num_tokens": 1423102786.0, "reward": 0.39957115054130554, "reward_std": 0.04453998804092407, "rewards/progression_diversity/mean": -0.00382626592181623, "rewards/progression_diversity/std": 0.02524666115641594, "rewards/symbolic_reward_accuracy/mean": 0.283203125, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.7708333134651184, "rewards/symbolic_reward_partial_score/std": 0.20369426906108856, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0627055168151855, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 13.541450500488281, "step": 2445 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3629473149776459, "epoch": 3.9198717948717947, "grad_norm": 0.03049568459391594, "learning_rate": 1e-06, "loss": 0.041, "step": 2446 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.360695943236351, "epoch": 3.921474358974359, "grad_norm": 0.03725721687078476, "learning_rate": 1e-06, "loss": 0.0375, "step": 2447 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.36237798631191254, "epoch": 3.9230769230769234, "grad_norm": 0.01814689300954342, "learning_rate": 1e-06, "loss": 0.0465, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4143.0, "completions/mean_length": 2448.4453125, "completions/mean_terminated_length": 2027.8551025390625, "completions/min_length": 1131.0, "completions/min_terminated_length": 1131.0, "entropy": 0.3400779664516449, "epoch": 3.9246794871794872, "frac_reward_zero_std": 0.46875, "grad_norm": 717.636962890625, "learning_rate": 1e-06, "loss": 0.1025, "num_tokens": 1425199558.0, "reward": 0.3190501034259796, "reward_std": 0.047453030943870544, "rewards/progression_diversity/mean": -0.0046582636423408985, "rewards/progression_diversity/std": 0.029829828068614006, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.7205566167831421, "rewards/symbolic_reward_partial_score/std": 0.21557864546775818, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0601270198822021, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 14.778732299804688, "step": 2449 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.36184197664260864, "epoch": 3.926282051282051, "grad_norm": 0.07136629521846771, "learning_rate": 1e-06, "loss": -0.0179, "step": 2450 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.36823903024196625, "epoch": 3.9278846153846154, "grad_norm": 0.012042498216032982, "learning_rate": 1e-06, "loss": 0.0099, "step": 2451 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3471096456050873, "epoch": 3.9294871794871797, "grad_norm": 0.020903823897242546, "learning_rate": 1e-06, "loss": 0.1012, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4468.0, "completions/mean_length": 2371.39453125, "completions/mean_terminated_length": 2006.336669921875, "completions/min_length": 1198.0, "completions/min_terminated_length": 1198.0, "entropy": 0.3484654426574707, "epoch": 3.9310897435897436, "frac_reward_zero_std": 0.4375, "grad_norm": 2592.323974609375, "learning_rate": 1e-06, "loss": 0.0898, "num_tokens": 1427341920.0, "reward": 0.32160401344299316, "reward_std": 0.039857640862464905, "rewards/progression_diversity/mean": -0.004153084009885788, "rewards/progression_diversity/std": 0.028649339452385902, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.729052722454071, "rewards/symbolic_reward_partial_score/std": 0.21416015923023224, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0638084411621094, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 12.2067232131958, "step": 2453 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.347993940114975, "epoch": 3.9326923076923075, "grad_norm": 0.02119533158838749, "learning_rate": 1e-06, "loss": 0.0476, "step": 2454 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3478931188583374, "epoch": 3.934294871794872, "grad_norm": 0.014371760189533234, "learning_rate": 1e-06, "loss": 0.0499, "step": 2455 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.35639145970344543, "epoch": 3.935897435897436, "grad_norm": 0.015084910206496716, "learning_rate": 1e-06, "loss": 0.0251, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3296.0, "completions/mean_length": 2630.390625, "completions/mean_terminated_length": 1953.9835205078125, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "entropy": 0.34472502768039703, "epoch": 3.9375, "frac_reward_zero_std": 0.375, "grad_norm": 823.965576171875, "learning_rate": 1e-06, "loss": 0.0218, "num_tokens": 1429514888.0, "reward": 0.386918306350708, "reward_std": 0.04596348851919174, "rewards/progression_diversity/mean": -0.009828666225075722, "rewards/progression_diversity/std": 0.04648015275597572, "rewards/symbolic_reward_accuracy/mean": 0.271484375, "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, "rewards/symbolic_reward_partial_score/mean": 0.7503417730331421, "rewards/symbolic_reward_partial_score/std": 0.21974274516105652, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0469261407852173, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 20.604896545410156, "step": 2457 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3289848268032074, "epoch": 3.939102564102564, "grad_norm": 0.01477957796305418, "learning_rate": 1e-06, "loss": 0.131, "step": 2458 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3502272367477417, "epoch": 3.940705128205128, "grad_norm": 0.026974955573678017, "learning_rate": 1e-06, "loss": 0.0262, "step": 2459 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.34581419825553894, "epoch": 3.9423076923076925, "grad_norm": 0.015963274985551834, "learning_rate": 1e-06, "loss": 0.0406, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3357.0, "completions/mean_length": 2388.841796875, "completions/mean_terminated_length": 1937.385009765625, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "entropy": 0.35383833944797516, "epoch": 3.9439102564102564, "frac_reward_zero_std": 0.53125, "grad_norm": 648.1668090820312, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 1431583207.0, "reward": 0.3101181387901306, "reward_std": 0.026755383238196373, "rewards/progression_diversity/mean": -0.00625104084610939, "rewards/progression_diversity/std": 0.03660469129681587, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.7318522334098816, "rewards/symbolic_reward_partial_score/std": 0.19400212168693542, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0573663711547852, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 16.051483154296875, "step": 2461 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.36202606558799744, "epoch": 3.9455128205128203, "grad_norm": 0.061913300305604935, "learning_rate": 1e-06, "loss": 0.015, "step": 2462 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.36147141456604004, "epoch": 3.9471153846153846, "grad_norm": 0.014589556492865086, "learning_rate": 1e-06, "loss": 0.0385, "step": 2463 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3382275700569153, "epoch": 3.948717948717949, "grad_norm": 0.013087283819913864, "learning_rate": 1e-06, "loss": 0.0554, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3255.0, "completions/mean_length": 2498.75, "completions/mean_terminated_length": 1963.61865234375, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "entropy": 0.35105569660663605, "epoch": 3.9503205128205128, "frac_reward_zero_std": 0.25, "grad_norm": 1137.3775634765625, "learning_rate": 1e-06, "loss": 0.0478, "num_tokens": 1433752439.0, "reward": 0.384078711271286, "reward_std": 0.03686286136507988, "rewards/progression_diversity/mean": -0.007657800801098347, "rewards/progression_diversity/std": 0.04040105640888214, "rewards/symbolic_reward_accuracy/mean": 0.275390625, "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, "rewards/symbolic_reward_partial_score/mean": 0.7329915165901184, "rewards/symbolic_reward_partial_score/std": 0.22255964577198029, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0503382682800293, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 19.593889236450195, "step": 2465 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3548099845647812, "epoch": 3.9519230769230766, "grad_norm": 0.07064145803451538, "learning_rate": 1e-06, "loss": 0.0108, "step": 2466 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3451448976993561, "epoch": 3.953525641025641, "grad_norm": 0.03678792715072632, "learning_rate": 1e-06, "loss": 0.0902, "step": 2467 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.34411998093128204, "epoch": 3.9551282051282053, "grad_norm": 0.026875555515289307, "learning_rate": 1e-06, "loss": 0.0816, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3373.0, "completions/mean_length": 2192.21484375, "completions/mean_terminated_length": 1909.510009765625, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "entropy": 0.3781355321407318, "epoch": 3.956730769230769, "frac_reward_zero_std": 0.5, "grad_norm": 0.01978994905948639, "learning_rate": 1e-06, "loss": -0.0087, "num_tokens": 1435768037.0, "reward": 0.2731976807117462, "reward_std": 0.02287241816520691, "rewards/progression_diversity/mean": -0.004451290238648653, "rewards/progression_diversity/std": 0.0341835618019104, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.6666666865348816, "rewards/symbolic_reward_partial_score/std": 0.2050163298845291, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0635859966278076, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 13.839681625366211, "step": 2469 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3631334751844406, "epoch": 3.9583333333333335, "grad_norm": 4.625295639038086, "learning_rate": 1e-06, "loss": 0.0434, "step": 2470 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.35326145589351654, "epoch": 3.9599358974358974, "grad_norm": 0.01084475964307785, "learning_rate": 1e-06, "loss": 0.0951, "step": 2471 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3633986711502075, "epoch": 3.9615384615384617, "grad_norm": 0.013056226074695587, "learning_rate": 1e-06, "loss": 0.0322, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 2316.728515625, "completions/mean_terminated_length": 1892.1629638671875, "completions/min_length": 1024.0, "completions/min_terminated_length": 1024.0, "entropy": 0.36079515516757965, "epoch": 3.9631410256410255, "frac_reward_zero_std": 0.3125, "grad_norm": 649.272216796875, "learning_rate": 1e-06, "loss": 0.0074, "num_tokens": 1437927290.0, "reward": 0.2825604975223541, "reward_std": 0.047691911458969116, "rewards/progression_diversity/mean": -0.007623001933097839, "rewards/progression_diversity/std": 0.04595175012946129, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.6758463382720947, "rewards/symbolic_reward_partial_score/std": 0.2019786536693573, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0513596534729004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 19.716333389282227, "step": 2473 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3496604412794113, "epoch": 3.96474358974359, "grad_norm": 3.18587327003479, "learning_rate": 1e-06, "loss": 0.0646, "step": 2474 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3459862917661667, "epoch": 3.9663461538461537, "grad_norm": 0.016324063763022423, "learning_rate": 1e-06, "loss": 0.0452, "step": 2475 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.35571056604385376, "epoch": 3.967948717948718, "grad_norm": 0.014685800299048424, "learning_rate": 1e-06, "loss": 0.0737, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3300.0, "completions/mean_length": 2103.21484375, "completions/mean_terminated_length": 1876.5357666015625, "completions/min_length": 1079.0, "completions/min_terminated_length": 1079.0, "entropy": 0.36506491899490356, "epoch": 3.969551282051282, "frac_reward_zero_std": 0.53125, "grad_norm": 0.013999047689139843, "learning_rate": 1e-06, "loss": 0.0181, "num_tokens": 1439892248.0, "reward": 0.34133556485176086, "reward_std": 0.026946410536766052, "rewards/progression_diversity/mean": -0.0046281758695840836, "rewards/progression_diversity/std": 0.038301728665828705, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.7603353261947632, "rewards/symbolic_reward_partial_score/std": 0.17550498247146606, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0721997022628784, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 10.131376266479492, "step": 2477 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.36544330418109894, "epoch": 3.9711538461538463, "grad_norm": 0.02787620946764946, "learning_rate": 1e-06, "loss": 0.0459, "step": 2478 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3669802248477936, "epoch": 3.97275641025641, "grad_norm": 0.015974465757608414, "learning_rate": 1e-06, "loss": 0.0146, "step": 2479 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.36941133439540863, "epoch": 3.9743589743589745, "grad_norm": 0.02147599868476391, "learning_rate": 1e-06, "loss": 0.0229, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 2934.0, "completions/mean_length": 3074.95703125, "completions/mean_terminated_length": 1885.63818359375, "completions/min_length": 1061.0, "completions/min_terminated_length": 1061.0, "entropy": 0.3382083773612976, "epoch": 3.9759615384615383, "frac_reward_zero_std": 0.1875, "grad_norm": 1267.6973876953125, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 1442438978.0, "reward": 0.2858215570449829, "reward_std": 0.044375061988830566, "rewards/progression_diversity/mean": -0.023311004042625427, "rewards/progression_diversity/std": 0.08079320937395096, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.6885416507720947, "rewards/symbolic_reward_partial_score/std": 0.19891256093978882, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.005120038986206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 43.875850677490234, "step": 2481 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3479975014925003, "epoch": 3.9775641025641026, "grad_norm": 0.0137171084061265, "learning_rate": 1e-06, "loss": 0.042, "step": 2482 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.31158269941806793, "epoch": 3.9791666666666665, "grad_norm": 0.028567982837557793, "learning_rate": 1e-06, "loss": 0.159, "step": 2483 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.31820471584796906, "epoch": 3.980769230769231, "grad_norm": 0.03962567821145058, "learning_rate": 1e-06, "loss": 0.1121, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3611.0, "completions/mean_length": 3046.513671875, "completions/mean_terminated_length": 1946.805419921875, "completions/min_length": 1105.0, "completions/min_terminated_length": 1105.0, "entropy": 0.33995670080184937, "epoch": 3.9823717948717947, "frac_reward_zero_std": 0.34375, "grad_norm": 649.2957763671875, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 1444918329.0, "reward": 0.3851238489151001, "reward_std": 0.020528025925159454, "rewards/progression_diversity/mean": -0.020821530371904373, "rewards/progression_diversity/std": 0.0761089101433754, "rewards/symbolic_reward_accuracy/mean": 0.28125, "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, "rewards/symbolic_reward_partial_score/mean": 0.7258462905883789, "rewards/symbolic_reward_partial_score/std": 0.23976966738700867, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0180208683013916, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 37.60797882080078, "step": 2485 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3226911425590515, "epoch": 3.983974358974359, "grad_norm": 3.0483925342559814, "learning_rate": 1e-06, "loss": 0.1089, "step": 2486 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3385407626628876, "epoch": 3.9855769230769234, "grad_norm": 0.01842661201953888, "learning_rate": 1e-06, "loss": 0.0844, "step": 2487 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3237884044647217, "epoch": 3.9871794871794872, "grad_norm": 0.01716519333422184, "learning_rate": 1e-06, "loss": 0.1273, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3178.0, "completions/mean_length": 3356.56640625, "completions/mean_terminated_length": 1883.89990234375, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "entropy": 0.33329902589321136, "epoch": 3.988782051282051, "frac_reward_zero_std": 0.34375, "grad_norm": 1143.9063720703125, "learning_rate": 1e-06, "loss": 0.085, "num_tokens": 1447458059.0, "reward": 0.48112747073173523, "reward_std": 0.07644528150558472, "rewards/progression_diversity/mean": -0.028856143355369568, "rewards/progression_diversity/std": 0.08837366849184036, "rewards/symbolic_reward_accuracy/mean": 0.4140625, "rewards/symbolic_reward_accuracy/std": 0.49304109811782837, "rewards/symbolic_reward_partial_score/mean": 0.7818033695220947, "rewards/symbolic_reward_partial_score/std": 0.23644909262657166, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0008317232131958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 46.746429443359375, "step": 2489 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3214826136827469, "epoch": 3.9903846153846154, "grad_norm": 32.26913833618164, "learning_rate": 1e-06, "loss": 0.0057, "step": 2490 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3154735267162323, "epoch": 3.9919871794871797, "grad_norm": 9.52782154083252, "learning_rate": 1e-06, "loss": 0.5166, "step": 2491 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.31883853673934937, "epoch": 3.9935897435897436, "grad_norm": 1062.50439453125, "learning_rate": 1e-06, "loss": 0.3673, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3200.0, "completions/mean_length": 3408.443359375, "completions/mean_terminated_length": 1941.6412353515625, "completions/min_length": 1132.0, "completions/min_terminated_length": 1132.0, "entropy": 0.3397838622331619, "epoch": 3.9951923076923075, "frac_reward_zero_std": 0.21875, "grad_norm": 321.0043640136719, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 1449967230.0, "reward": 0.4184061884880066, "reward_std": 0.0656353235244751, "rewards/progression_diversity/mean": -0.028522029519081116, "rewards/progression_diversity/std": 0.08747442811727524, "rewards/symbolic_reward_accuracy/mean": 0.3203125, "rewards/symbolic_reward_accuracy/std": 0.4670529365539551, "rewards/symbolic_reward_partial_score/mean": 0.7582682371139526, "rewards/symbolic_reward_partial_score/std": 0.22663599252700806, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0017540454864502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 46.01112747192383, "step": 2493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3056804835796356, "epoch": 3.996794871794872, "grad_norm": 35.7710075378418, "learning_rate": 1e-06, "loss": 0.2021, "step": 2494 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.33890417218208313, "epoch": 3.998397435897436, "grad_norm": 0.29032865166664124, "learning_rate": 1e-06, "loss": 0.0841, "step": 2495 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31833402812480927, "epoch": 4.0, "grad_norm": 0.019398123025894165, "learning_rate": 1e-06, "loss": 0.1324, "step": 2496 }, { "epoch": 4.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0859375, "eval_completions/max_length": 16384.0, "eval_completions/max_terminated_length": 2986.0625, "eval_completions/mean_length": 3210.553466796875, "eval_completions/mean_terminated_length": 1972.495346069336, "eval_completions/min_length": 1199.8125, "eval_completions/min_terminated_length": 1199.8125, "eval_entropy": 0.3062564991414547, "eval_frac_reward_zero_std": 0.23828125, "eval_loss": 0.03558493033051491, "eval_num_tokens": 1449967230.0, "eval_reward": 0.25646816566586494, "eval_reward_std": 0.03971813467796892, "eval_rewards/progression_diversity/mean": -0.022923253040062264, "eval_rewards/progression_diversity/std": 0.07360347534995526, "eval_rewards/symbolic_reward_accuracy/mean": 0.093994140625, "eval_rewards/symbolic_reward_accuracy/std": 0.21256664022803307, "eval_rewards/symbolic_reward_partial_score/mean": 0.6708434987813234, "eval_rewards/symbolic_reward_partial_score/std": 0.1871126431506127, "eval_rewards/tag_count_reward/mean": -0.009521484375, "eval_rewards/tag_count_reward/std": 0.07448925846256316, "eval_runtime": 4359.773, "eval_samples_per_second": 0.057, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.03790351934731, "eval_sampling/importance_sampling_ratio/min": 0.0, "eval_sampling/sampling_logp_difference/max": 782.375, "eval_sampling/sampling_logp_difference/mean": 27.384851962327957, "eval_steps_per_second": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3075.0, "completions/mean_length": 2979.962890625, "completions/mean_terminated_length": 1966.2122802734375, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "entropy": 0.32689496874809265, "epoch": 4.001602564102564, "frac_reward_zero_std": 0.34375, "grad_norm": 373.3982849121094, "learning_rate": 1e-06, "loss": 0.0951, "num_tokens": 1452348555.0, "reward": 0.3907577395439148, "reward_std": 0.044927872717380524, "rewards/progression_diversity/mean": -0.018952488899230957, "rewards/progression_diversity/std": 0.0710325539112091, "rewards/symbolic_reward_accuracy/mean": 0.271484375, "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, "rewards/symbolic_reward_partial_score/mean": 0.7653970718383789, "rewards/symbolic_reward_partial_score/std": 0.23204264044761658, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0241422653198242, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 34.15648651123047, "step": 2497 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3391586244106293, "epoch": 4.003205128205129, "grad_norm": 0.013818570412695408, "learning_rate": 1e-06, "loss": 0.2369, "step": 2498 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.32884329557418823, "epoch": 4.0048076923076925, "grad_norm": 7747.47119140625, "learning_rate": 1e-06, "loss": 1.4664, "step": 2499 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3341674506664276, "epoch": 4.006410256410256, "grad_norm": 0.5125430822372437, "learning_rate": 1e-06, "loss": 0.2162, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3270.0, "completions/mean_length": 3343.3125, "completions/mean_terminated_length": 1963.196533203125, "completions/min_length": 1081.0, "completions/min_terminated_length": 1081.0, "entropy": 0.3030368685722351, "epoch": 4.00801282051282, "frac_reward_zero_std": 0.25, "grad_norm": 691.9620971679688, "learning_rate": 1e-06, "loss": 0.1049, "num_tokens": 1454962379.0, "reward": 0.34921398758888245, "reward_std": 0.05184464529156685, "rewards/progression_diversity/mean": -0.02537970244884491, "rewards/progression_diversity/std": 0.08060704171657562, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.6792155504226685, "rewards/symbolic_reward_partial_score/std": 0.23507006466388702, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0067884922027588, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 43.07496643066406, "step": 2501 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3297167718410492, "epoch": 4.009615384615385, "grad_norm": 0.07673323899507523, "learning_rate": 1e-06, "loss": 0.0893, "step": 2502 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3251052349805832, "epoch": 4.011217948717949, "grad_norm": 0.011674604378640652, "learning_rate": 1e-06, "loss": 0.1128, "step": 2503 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3353799283504486, "epoch": 4.012820512820513, "grad_norm": 0.029069218784570694, "learning_rate": 1e-06, "loss": 0.0488, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3116.0, "completions/mean_length": 3221.0078125, "completions/mean_terminated_length": 1983.461669921875, "completions/min_length": 1074.0, "completions/min_terminated_length": 1074.0, "entropy": 0.3239881694316864, "epoch": 4.014423076923077, "frac_reward_zero_std": 0.1875, "grad_norm": 496.55010986328125, "learning_rate": 1e-06, "loss": 0.0835, "num_tokens": 1457508255.0, "reward": 0.38050615787506104, "reward_std": 0.04678242653608322, "rewards/progression_diversity/mean": -0.02165273018181324, "rewards/progression_diversity/std": 0.07281038910150528, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.740429699420929, "rewards/symbolic_reward_partial_score/std": 0.20741601288318634, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0144760608673096, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 38.09929656982422, "step": 2505 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.29394128918647766, "epoch": 4.016025641025641, "grad_norm": 0.03365239128470421, "learning_rate": 1e-06, "loss": 0.1895, "step": 2506 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3248167932033539, "epoch": 4.017628205128205, "grad_norm": 0.03242243453860283, "learning_rate": 1e-06, "loss": 0.0367, "step": 2507 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3440161794424057, "epoch": 4.019230769230769, "grad_norm": 0.023058878257870674, "learning_rate": 1e-06, "loss": 0.0453, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3390.0, "completions/mean_length": 2804.220703125, "completions/mean_terminated_length": 1988.8717041015625, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "entropy": 0.3260979652404785, "epoch": 4.020833333333333, "frac_reward_zero_std": 0.4375, "grad_norm": 859.6063842773438, "learning_rate": 1e-06, "loss": 0.0866, "num_tokens": 1459937296.0, "reward": 0.23110386729240417, "reward_std": 0.02382086217403412, "rewards/progression_diversity/mean": -0.015101805329322815, "rewards/progression_diversity/std": 0.06296249479055405, "rewards/symbolic_reward_accuracy/mean": 0.05859375, "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, "rewards/symbolic_reward_partial_score/mean": 0.6582194566726685, "rewards/symbolic_reward_partial_score/std": 0.18645817041397095, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.035254716873169, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 27.520709991455078, "step": 2509 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.34093551337718964, "epoch": 4.022435897435898, "grad_norm": 2172.674560546875, "learning_rate": 1e-06, "loss": 0.2025, "step": 2510 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3240719437599182, "epoch": 4.024038461538462, "grad_norm": 67.2519760131836, "learning_rate": 1e-06, "loss": 0.0682, "step": 2511 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3448804020881653, "epoch": 4.0256410256410255, "grad_norm": 0.01443031057715416, "learning_rate": 1e-06, "loss": 0.0467, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.033203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 2421.150390625, "completions/mean_terminated_length": 1941.6182861328125, "completions/min_length": 1169.0, "completions/min_terminated_length": 1169.0, "entropy": 0.3457055538892746, "epoch": 4.027243589743589, "frac_reward_zero_std": 0.59375, "grad_norm": 373.1669616699219, "learning_rate": 1e-06, "loss": 0.0384, "num_tokens": 1462033373.0, "reward": 0.36610591411590576, "reward_std": 0.008678528480231762, "rewards/progression_diversity/mean": -0.008554144762456417, "rewards/progression_diversity/std": 0.04716269671916962, "rewards/symbolic_reward_accuracy/mean": 0.25, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.7212890386581421, "rewards/symbolic_reward_partial_score/std": 0.2121674120426178, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0559264421463013, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 17.93334197998047, "step": 2513 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.35698336362838745, "epoch": 4.028846153846154, "grad_norm": 0.10000839084386826, "learning_rate": 1e-06, "loss": 0.022, "step": 2514 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3595133423805237, "epoch": 4.030448717948718, "grad_norm": 0.02287139743566513, "learning_rate": 1e-06, "loss": 0.0303, "step": 2515 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.34900110960006714, "epoch": 4.032051282051282, "grad_norm": 0.011496487073600292, "learning_rate": 1e-06, "loss": 0.0666, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3091.0, "completions/mean_length": 2512.10546875, "completions/mean_terminated_length": 1977.48876953125, "completions/min_length": 1222.0, "completions/min_terminated_length": 1222.0, "entropy": 0.3486749231815338, "epoch": 4.033653846153846, "frac_reward_zero_std": 0.375, "grad_norm": 842.0504150390625, "learning_rate": 1e-06, "loss": 0.0281, "num_tokens": 1464206211.0, "reward": 0.31601575016975403, "reward_std": 0.02459620125591755, "rewards/progression_diversity/mean": -0.00828959047794342, "rewards/progression_diversity/std": 0.04331068694591522, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.7470214366912842, "rewards/symbolic_reward_partial_score/std": 0.18681012094020844, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0497366189956665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 21.2126522064209, "step": 2517 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3538134843111038, "epoch": 4.035256410256411, "grad_norm": 0.03899611532688141, "learning_rate": 1e-06, "loss": 0.0486, "step": 2518 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3479982316493988, "epoch": 4.0368589743589745, "grad_norm": 0.013328113593161106, "learning_rate": 1e-06, "loss": 0.0789, "step": 2519 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.35109949111938477, "epoch": 4.038461538461538, "grad_norm": 0.015972906723618507, "learning_rate": 1e-06, "loss": 0.0823, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3298.0, "completions/mean_length": 2378.287109375, "completions/mean_terminated_length": 1867.95751953125, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "entropy": 0.35145680606365204, "epoch": 4.040064102564102, "frac_reward_zero_std": 0.53125, "grad_norm": 592.3827514648438, "learning_rate": 1e-06, "loss": 0.0507, "num_tokens": 1466305798.0, "reward": 0.3080458343029022, "reward_std": 0.013700053095817566, "rewards/progression_diversity/mean": -0.00889340415596962, "rewards/progression_diversity/std": 0.04732125252485275, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.7198241949081421, "rewards/symbolic_reward_partial_score/std": 0.19643016159534454, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0504951477050781, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 20.981121063232422, "step": 2521 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3594615310430527, "epoch": 4.041666666666667, "grad_norm": 0.03896607458591461, "learning_rate": 1e-06, "loss": 0.017, "step": 2522 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.34222954511642456, "epoch": 4.043269230769231, "grad_norm": 0.006857524160295725, "learning_rate": 1e-06, "loss": 0.0644, "step": 2523 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.35084451735019684, "epoch": 4.044871794871795, "grad_norm": 0.009686310775578022, "learning_rate": 1e-06, "loss": 0.0258, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 2450.3046875, "completions/mean_terminated_length": 1824.7100830078125, "completions/min_length": 1064.0, "completions/min_terminated_length": 1064.0, "entropy": 0.3546312600374222, "epoch": 4.046474358974359, "frac_reward_zero_std": 0.53125, "grad_norm": 310.8623962402344, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 1468479282.0, "reward": 0.38561779260635376, "reward_std": 0.018528560176491737, "rewards/progression_diversity/mean": -0.010976451449096203, "rewards/progression_diversity/std": 0.05261799693107605, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.7551594972610474, "rewards/symbolic_reward_partial_score/std": 0.20599855482578278, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0471153259277344, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 22.519685745239258, "step": 2525 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3387318253517151, "epoch": 4.048076923076923, "grad_norm": 0.017549673095345497, "learning_rate": 1e-06, "loss": 0.1121, "step": 2526 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3566637337207794, "epoch": 4.049679487179487, "grad_norm": 0.025295212864875793, "learning_rate": 1e-06, "loss": 0.0235, "step": 2527 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.33428405225276947, "epoch": 4.051282051282051, "grad_norm": 0.019428903236985207, "learning_rate": 1e-06, "loss": 0.0258, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 3127.302734375, "completions/mean_terminated_length": 1787.3785400390625, "completions/min_length": 1011.0, "completions/min_terminated_length": 1011.0, "entropy": 0.3295024484395981, "epoch": 4.052884615384615, "frac_reward_zero_std": 0.34375, "grad_norm": 1348.5477294921875, "learning_rate": 1e-06, "loss": 0.057, "num_tokens": 1470886541.0, "reward": 0.28540563583374023, "reward_std": 0.019026556983590126, "rewards/progression_diversity/mean": -0.02584357187151909, "rewards/progression_diversity/std": 0.08347862958908081, "rewards/symbolic_reward_accuracy/mean": 0.119140625, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.7152343988418579, "rewards/symbolic_reward_partial_score/std": 0.19548086822032928, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.010728359222412, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 41.08893585205078, "step": 2529 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30960477888584137, "epoch": 4.05448717948718, "grad_norm": 96.72750091552734, "learning_rate": 1e-06, "loss": 0.0919, "step": 2530 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.32376880943775177, "epoch": 4.056089743589744, "grad_norm": 0.019057314842939377, "learning_rate": 1e-06, "loss": 0.0911, "step": 2531 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.32927900552749634, "epoch": 4.0576923076923075, "grad_norm": 0.010551064275205135, "learning_rate": 1e-06, "loss": 0.1108, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 2688.408203125, "completions/mean_terminated_length": 1775.368896484375, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "entropy": 0.34093132615089417, "epoch": 4.059294871794871, "frac_reward_zero_std": 0.375, "grad_norm": 454.11712646484375, "learning_rate": 1e-06, "loss": 0.0975, "num_tokens": 1473027214.0, "reward": 0.42107725143432617, "reward_std": 0.030925702303647995, "rewards/progression_diversity/mean": -0.01678628847002983, "rewards/progression_diversity/std": 0.06638278067111969, "rewards/symbolic_reward_accuracy/mean": 0.3046875, "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, "rewards/symbolic_reward_partial_score/mean": 0.7967284917831421, "rewards/symbolic_reward_partial_score/std": 0.18162044882774353, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0306940078735352, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 30.967906951904297, "step": 2533 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.32887102663517, "epoch": 4.060897435897436, "grad_norm": 0.00896961148828268, "learning_rate": 1e-06, "loss": 0.1063, "step": 2534 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3550725132226944, "epoch": 4.0625, "grad_norm": 0.014958408661186695, "learning_rate": 1e-06, "loss": 0.069, "step": 2535 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.34709444642066956, "epoch": 4.064102564102564, "grad_norm": 0.013090104795992374, "learning_rate": 1e-06, "loss": 0.0225, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.087890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 3024.65625, "completions/mean_terminated_length": 1737.353271484375, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "entropy": 0.3089066445827484, "epoch": 4.065705128205129, "frac_reward_zero_std": 0.4375, "grad_norm": 508.8141174316406, "learning_rate": 1e-06, "loss": 0.0951, "num_tokens": 1475532462.0, "reward": 0.31843122839927673, "reward_std": 0.011976012028753757, "rewards/progression_diversity/mean": -0.025041041895747185, "rewards/progression_diversity/std": 0.08240573108196259, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.6918293833732605, "rewards/symbolic_reward_partial_score/std": 0.20478877425193787, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0157009363174438, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 37.54655075073242, "step": 2537 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.31688109040260315, "epoch": 4.0673076923076925, "grad_norm": 1613.686279296875, "learning_rate": 1e-06, "loss": 0.1161, "step": 2538 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30626846849918365, "epoch": 4.068910256410256, "grad_norm": 0.013836408033967018, "learning_rate": 1e-06, "loss": 0.0784, "step": 2539 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.3128751218318939, "epoch": 4.07051282051282, "grad_norm": 0.025374021381139755, "learning_rate": 1e-06, "loss": 0.0991, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2768.0, "completions/mean_length": 2856.404296875, "completions/mean_terminated_length": 1741.0211181640625, "completions/min_length": 999.0, "completions/min_terminated_length": 999.0, "entropy": 0.3115910291671753, "epoch": 4.072115384615385, "frac_reward_zero_std": 0.34375, "grad_norm": 627.9964599609375, "learning_rate": 1e-06, "loss": 0.1078, "num_tokens": 1477984861.0, "reward": 0.29800140857696533, "reward_std": 0.011926448903977871, "rewards/progression_diversity/mean": -0.02163849025964737, "rewards/progression_diversity/std": 0.07751591503620148, "rewards/symbolic_reward_accuracy/mean": 0.123046875, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.7486165761947632, "rewards/symbolic_reward_partial_score/std": 0.1641637235879898, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0106605291366577, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 40.304405212402344, "step": 2541 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3093924969434738, "epoch": 4.073717948717949, "grad_norm": 0.008785123936831951, "learning_rate": 1e-06, "loss": 0.0982, "step": 2542 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.33030666410923004, "epoch": 4.075320512820513, "grad_norm": 0.017673548310995102, "learning_rate": 1e-06, "loss": 0.0605, "step": 2543 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3170679658651352, "epoch": 4.076923076923077, "grad_norm": 0.013326681219041348, "learning_rate": 1e-06, "loss": 0.0929, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 2784.251953125, "completions/mean_terminated_length": 1724.903076171875, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "entropy": 0.3214297592639923, "epoch": 4.078525641025641, "frac_reward_zero_std": 0.375, "grad_norm": 304.0992431640625, "learning_rate": 1e-06, "loss": 0.076, "num_tokens": 1480289838.0, "reward": 0.3872213363647461, "reward_std": 0.013941626995801926, "rewards/progression_diversity/mean": -0.0229843370616436, "rewards/progression_diversity/std": 0.08313606679439545, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.79931640625, "rewards/symbolic_reward_partial_score/std": 0.18186363577842712, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0218315124511719, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 34.574668884277344, "step": 2545 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3198620527982712, "epoch": 4.080128205128205, "grad_norm": 0.009517377242445946, "learning_rate": 1e-06, "loss": 0.0768, "step": 2546 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.30512526631355286, "epoch": 4.081730769230769, "grad_norm": 0.018004372715950012, "learning_rate": 1e-06, "loss": 0.1049, "step": 2547 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3438553065061569, "epoch": 4.083333333333333, "grad_norm": 0.025911420583724976, "learning_rate": 1e-06, "loss": 0.059, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2753.0, "completions/mean_length": 2879.19921875, "completions/mean_terminated_length": 1765.695556640625, "completions/min_length": 1122.0, "completions/min_terminated_length": 1122.0, "entropy": 0.3178988993167877, "epoch": 4.084935897435898, "frac_reward_zero_std": 0.25, "grad_norm": 239.32733154296875, "learning_rate": 1e-06, "loss": 0.0345, "num_tokens": 1482682692.0, "reward": 0.35247373580932617, "reward_std": 0.008235493674874306, "rewards/progression_diversity/mean": -0.025090141221880913, "rewards/progression_diversity/std": 0.08833190053701401, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.7382487058639526, "rewards/symbolic_reward_partial_score/std": 0.19196312129497528, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0059094429016113, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 43.50497055053711, "step": 2549 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.32996954023838043, "epoch": 4.086538461538462, "grad_norm": 0.013742972165346146, "learning_rate": 1e-06, "loss": 0.0517, "step": 2550 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31920306384563446, "epoch": 4.0881410256410255, "grad_norm": 0.011623159982264042, "learning_rate": 1e-06, "loss": 0.1203, "step": 2551 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30411607027053833, "epoch": 4.089743589743589, "grad_norm": 0.01574169658124447, "learning_rate": 1e-06, "loss": 0.1498, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 2363.705078125, "completions/mean_terminated_length": 1764.0592041015625, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "entropy": 0.3521651029586792, "epoch": 4.091346153846154, "frac_reward_zero_std": 0.375, "grad_norm": 662.1588745117188, "learning_rate": 1e-06, "loss": 0.0482, "num_tokens": 1484783181.0, "reward": 0.26874637603759766, "reward_std": 0.013555062934756279, "rewards/progression_diversity/mean": -0.014036715030670166, "rewards/progression_diversity/std": 0.06895329058170319, "rewards/symbolic_reward_accuracy/mean": 0.09375, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.7100911140441895, "rewards/symbolic_reward_partial_score/std": 0.17183727025985718, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0332553386688232, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 29.85314178466797, "step": 2553 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.34595395624637604, "epoch": 4.092948717948718, "grad_norm": 4.502384185791016, "learning_rate": 1e-06, "loss": 0.0216, "step": 2554 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3410833477973938, "epoch": 4.094551282051282, "grad_norm": 0.011607046239078045, "learning_rate": 1e-06, "loss": 0.0667, "step": 2555 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.33509866893291473, "epoch": 4.096153846153846, "grad_norm": 0.014269710518419743, "learning_rate": 1e-06, "loss": 0.0963, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3428.0, "completions/mean_length": 2880.39453125, "completions/mean_terminated_length": 1859.113525390625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "entropy": 0.32943619787693024, "epoch": 4.097756410256411, "frac_reward_zero_std": 0.375, "grad_norm": 1257.3856201171875, "learning_rate": 1e-06, "loss": 0.0543, "num_tokens": 1487087367.0, "reward": 0.3605473041534424, "reward_std": 0.030462127178907394, "rewards/progression_diversity/mean": -0.02095445990562439, "rewards/progression_diversity/std": 0.07725287228822708, "rewards/symbolic_reward_accuracy/mean": 0.23046875, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.7422363758087158, "rewards/symbolic_reward_partial_score/std": 0.19137205183506012, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0257246494293213, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 34.513587951660156, "step": 2557 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3395875543355942, "epoch": 4.0993589743589745, "grad_norm": 1.989341139793396, "learning_rate": 1e-06, "loss": 0.0551, "step": 2558 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.33270154893398285, "epoch": 4.100961538461538, "grad_norm": 0.0756467655301094, "learning_rate": 1e-06, "loss": 0.1137, "step": 2559 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3363744169473648, "epoch": 4.102564102564102, "grad_norm": 0.012534077279269695, "learning_rate": 1e-06, "loss": 0.0671, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3092.0, "completions/mean_length": 2916.15234375, "completions/mean_terminated_length": 1927.9454345703125, "completions/min_length": 1158.0, "completions/min_terminated_length": 1158.0, "entropy": 0.33795928955078125, "epoch": 4.104166666666667, "frac_reward_zero_std": 0.40625, "grad_norm": 486.66265869140625, "learning_rate": 1e-06, "loss": 0.0406, "num_tokens": 1489486389.0, "reward": 0.3729179799556732, "reward_std": 0.019203029572963715, "rewards/progression_diversity/mean": -0.019237220287322998, "rewards/progression_diversity/std": 0.07287276536226273, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.7495605945587158, "rewards/symbolic_reward_partial_score/std": 0.2092960774898529, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0225183963775635, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 35.221961975097656, "step": 2561 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.33110561966896057, "epoch": 4.105769230769231, "grad_norm": 13.403287887573242, "learning_rate": 1e-06, "loss": 0.0302, "step": 2562 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3245306611061096, "epoch": 4.107371794871795, "grad_norm": 0.013325260020792484, "learning_rate": 1e-06, "loss": 0.1297, "step": 2563 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3338320553302765, "epoch": 4.108974358974359, "grad_norm": 1.518842101097107, "learning_rate": 1e-06, "loss": 0.0908, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3086.0, "completions/mean_length": 3535.826171875, "completions/mean_terminated_length": 1989.547119140625, "completions/min_length": 1126.0, "completions/min_terminated_length": 1126.0, "entropy": 0.31208691000938416, "epoch": 4.110576923076923, "frac_reward_zero_std": 0.1875, "grad_norm": 2256.84619140625, "learning_rate": 1e-06, "loss": 0.1228, "num_tokens": 1492263788.0, "reward": 0.3425925374031067, "reward_std": 0.025561640039086342, "rewards/progression_diversity/mean": -0.030298512428998947, "rewards/progression_diversity/std": 0.08967959880828857, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.7237141728401184, "rewards/symbolic_reward_partial_score/std": 0.20818644762039185, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9913551807403564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 50.347373962402344, "step": 2565 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.3200324475765228, "epoch": 4.112179487179487, "grad_norm": 1.371201515197754, "learning_rate": 1e-06, "loss": 0.1, "step": 2566 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.30504705011844635, "epoch": 4.113782051282051, "grad_norm": 2.302720308303833, "learning_rate": 1e-06, "loss": 0.1468, "step": 2567 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3112814277410507, "epoch": 4.115384615384615, "grad_norm": 7566.38720703125, "learning_rate": 1e-06, "loss": 0.2698, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3291.0, "completions/mean_length": 3172.8828125, "completions/mean_terminated_length": 2022.87060546875, "completions/min_length": 1223.0, "completions/min_terminated_length": 1223.0, "entropy": 0.35531145334243774, "epoch": 4.11698717948718, "frac_reward_zero_std": 0.15625, "grad_norm": 797.2550048828125, "learning_rate": 1e-06, "loss": 0.0721, "num_tokens": 1494671600.0, "reward": 0.32184159755706787, "reward_std": 0.0444670133292675, "rewards/progression_diversity/mean": -0.02189837582409382, "rewards/progression_diversity/std": 0.07626625150442123, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.7167643308639526, "rewards/symbolic_reward_partial_score/std": 0.23638619482517242, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.007836103439331, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 40.60625457763672, "step": 2569 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3130183517932892, "epoch": 4.118589743589744, "grad_norm": 21.2623291015625, "learning_rate": 1e-06, "loss": 0.0972, "step": 2570 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3290316015481949, "epoch": 4.1201923076923075, "grad_norm": 1913.60205078125, "learning_rate": 1e-06, "loss": 0.2226, "step": 2571 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3343227505683899, "epoch": 4.121794871794871, "grad_norm": 0.022048160433769226, "learning_rate": 1e-06, "loss": 0.1135, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 3387.615234375, "completions/mean_terminated_length": 2043.16162109375, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "entropy": 0.3383503258228302, "epoch": 4.123397435897436, "frac_reward_zero_std": 0.25, "grad_norm": 572.839111328125, "learning_rate": 1e-06, "loss": 0.1123, "num_tokens": 1497270795.0, "reward": 0.41935285925865173, "reward_std": 0.022548135370016098, "rewards/progression_diversity/mean": -0.02565228007733822, "rewards/progression_diversity/std": 0.08250828087329865, "rewards/symbolic_reward_accuracy/mean": 0.310546875, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.7795573472976685, "rewards/symbolic_reward_partial_score/std": 0.20047199726104736, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0107229948043823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 37.779056549072266, "step": 2573 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3201807737350464, "epoch": 4.125, "grad_norm": 0.01932665705680847, "learning_rate": 1e-06, "loss": 0.1213, "step": 2574 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.31810304522514343, "epoch": 4.126602564102564, "grad_norm": 0.022387662902474403, "learning_rate": 1e-06, "loss": 0.1087, "step": 2575 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3269665241241455, "epoch": 4.128205128205128, "grad_norm": 0.013768312521278858, "learning_rate": 1e-06, "loss": 0.1118, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3283.0, "completions/mean_length": 2913.330078125, "completions/mean_terminated_length": 2015.2855224609375, "completions/min_length": 1204.0, "completions/min_terminated_length": 1204.0, "entropy": 0.3411347270011902, "epoch": 4.1298076923076925, "frac_reward_zero_std": 0.25, "grad_norm": 542.296875, "learning_rate": 1e-06, "loss": 0.0579, "num_tokens": 1499596596.0, "reward": 0.2602491080760956, "reward_std": 0.025792162865400314, "rewards/progression_diversity/mean": -0.015127346850931644, "rewards/progression_diversity/std": 0.06053609028458595, "rewards/symbolic_reward_accuracy/mean": 0.095703125, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.6778970956802368, "rewards/symbolic_reward_partial_score/std": 0.19333595037460327, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0344008207321167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 27.88906478881836, "step": 2577 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3439747095108032, "epoch": 4.131410256410256, "grad_norm": 0.02023777738213539, "learning_rate": 1e-06, "loss": 0.0927, "step": 2578 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.34476563334465027, "epoch": 4.13301282051282, "grad_norm": 7.5225725173950195, "learning_rate": 1e-06, "loss": 0.0408, "step": 2579 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3402794599533081, "epoch": 4.134615384615385, "grad_norm": 0.021249419078230858, "learning_rate": 1e-06, "loss": 0.0551, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3805.0, "completions/mean_length": 2671.884765625, "completions/mean_terminated_length": 1967.9775390625, "completions/min_length": 597.0, "completions/min_terminated_length": 597.0, "entropy": 0.33940988779067993, "epoch": 4.136217948717949, "frac_reward_zero_std": 0.21875, "grad_norm": 498.6994323730469, "learning_rate": 1e-06, "loss": 0.0785, "num_tokens": 1501793977.0, "reward": 0.3605765700340271, "reward_std": 0.06087111681699753, "rewards/progression_diversity/mean": -0.01119298581033945, "rewards/progression_diversity/std": 0.05226728320121765, "rewards/symbolic_reward_accuracy/mean": 0.2265625, "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, "rewards/symbolic_reward_partial_score/mean": 0.7517740726470947, "rewards/symbolic_reward_partial_score/std": 0.18635277450084686, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0438597202301025, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 23.59453582763672, "step": 2581 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3429712653160095, "epoch": 4.137820512820513, "grad_norm": 472.5995788574219, "learning_rate": 1e-06, "loss": 0.0979, "step": 2582 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3512907773256302, "epoch": 4.139423076923077, "grad_norm": 0.022063203155994415, "learning_rate": 1e-06, "loss": 0.0575, "step": 2583 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.36512602865695953, "epoch": 4.141025641025641, "grad_norm": 0.027568161487579346, "learning_rate": 1e-06, "loss": 0.043, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3768.0, "completions/mean_length": 2782.953125, "completions/mean_terminated_length": 1966.3271484375, "completions/min_length": 1182.0, "completions/min_terminated_length": 1182.0, "entropy": 0.33962003886699677, "epoch": 4.142628205128205, "frac_reward_zero_std": 0.3125, "grad_norm": 1543.4376220703125, "learning_rate": 1e-06, "loss": 0.0607, "num_tokens": 1504024065.0, "reward": 0.3242529630661011, "reward_std": 0.023926857858896255, "rewards/progression_diversity/mean": -0.014647168107330799, "rewards/progression_diversity/std": 0.0613836906850338, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.716748058795929, "rewards/symbolic_reward_partial_score/std": 0.1955670863389969, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.031816005706787, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 30.275238037109375, "step": 2585 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.33402132987976074, "epoch": 4.144230769230769, "grad_norm": 144.89903259277344, "learning_rate": 1e-06, "loss": 1.3615, "step": 2586 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3640512526035309, "epoch": 4.145833333333333, "grad_norm": 0.025912059471011162, "learning_rate": 1e-06, "loss": 0.0157, "step": 2587 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.346598282456398, "epoch": 4.147435897435898, "grad_norm": 0.017162565141916275, "learning_rate": 1e-06, "loss": 0.065, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3701.0, "completions/mean_length": 3073.26953125, "completions/mean_terminated_length": 2006.1644287109375, "completions/min_length": 1134.0, "completions/min_terminated_length": 1134.0, "entropy": 0.32446981966495514, "epoch": 4.149038461538462, "frac_reward_zero_std": 0.28125, "grad_norm": 570.8603515625, "learning_rate": 1e-06, "loss": 0.1123, "num_tokens": 1506499115.0, "reward": 0.3911179006099701, "reward_std": 0.021029271185398102, "rewards/progression_diversity/mean": -0.018583200871944427, "rewards/progression_diversity/std": 0.06753162294626236, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.7516113519668579, "rewards/symbolic_reward_partial_score/std": 0.21333743631839752, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0214645862579346, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 35.46625900268555, "step": 2589 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.35942013561725616, "epoch": 4.1506410256410255, "grad_norm": 4977.77294921875, "learning_rate": 1e-06, "loss": 0.2231, "step": 2590 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3404110074043274, "epoch": 4.152243589743589, "grad_norm": 97.81608581542969, "learning_rate": 1e-06, "loss": 0.1382, "step": 2591 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3034721314907074, "epoch": 4.153846153846154, "grad_norm": 10592.0458984375, "learning_rate": 1e-06, "loss": 0.6398, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3079.0, "completions/mean_length": 3393.611328125, "completions/mean_terminated_length": 1925.132568359375, "completions/min_length": 1143.0, "completions/min_terminated_length": 1143.0, "entropy": 0.314235121011734, "epoch": 4.155448717948718, "frac_reward_zero_std": 0.25, "grad_norm": 377.22088623046875, "learning_rate": 1e-06, "loss": 0.0613, "num_tokens": 1509122212.0, "reward": 0.3483143150806427, "reward_std": 0.027692176401615143, "rewards/progression_diversity/mean": -0.024526942521333694, "rewards/progression_diversity/std": 0.07418946921825409, "rewards/symbolic_reward_accuracy/mean": 0.216796875, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.7308756113052368, "rewards/symbolic_reward_partial_score/std": 0.21280378103256226, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.010713815689087, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 40.72765350341797, "step": 2593 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3067990243434906, "epoch": 4.157051282051282, "grad_norm": 645088.0625, "learning_rate": 1e-06, "loss": 45.8723, "step": 2594 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3317181318998337, "epoch": 4.158653846153846, "grad_norm": 10997.2705078125, "learning_rate": 1e-06, "loss": 1.253, "step": 2595 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3151821494102478, "epoch": 4.160256410256411, "grad_norm": 50563.86328125, "learning_rate": 1e-06, "loss": 9.5746, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.19140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3123.0, "completions/mean_length": 4673.439453125, "completions/mean_terminated_length": 1901.3743896484375, "completions/min_length": 1096.0, "completions/min_terminated_length": 1096.0, "entropy": 0.2506090775132179, "epoch": 4.1618589743589745, "frac_reward_zero_std": 0.0625, "grad_norm": 1328.9156494140625, "learning_rate": 1e-06, "loss": 0.1149, "num_tokens": 1512400453.0, "reward": 0.44010183215141296, "reward_std": 0.06620557606220245, "rewards/progression_diversity/mean": -0.042061757296323776, "rewards/progression_diversity/std": 0.08827082067728043, "rewards/symbolic_reward_accuracy/mean": 0.361328125, "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, "rewards/symbolic_reward_partial_score/mean": 0.7477050423622131, "rewards/symbolic_reward_partial_score/std": 0.23697598278522491, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9721221923828125, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 55.41648864746094, "step": 2597 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.24983707815408707, "epoch": 4.163461538461538, "grad_norm": 100176.0, "learning_rate": 1e-06, "loss": 29.2153, "step": 2598 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2740893214941025, "epoch": 4.165064102564102, "grad_norm": 18222.609375, "learning_rate": 1e-06, "loss": 1.2944, "step": 2599 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3012789934873581, "epoch": 4.166666666666667, "grad_norm": 35.68260192871094, "learning_rate": 1e-06, "loss": 0.0814, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.208984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2746.0, "completions/mean_length": 4911.619140625, "completions/mean_terminated_length": 1880.64453125, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "entropy": 0.24826913326978683, "epoch": 4.168269230769231, "frac_reward_zero_std": 0.03125, "grad_norm": 513.0524291992188, "learning_rate": 1e-06, "loss": 0.1546, "num_tokens": 1515790834.0, "reward": 0.3871710002422333, "reward_std": 0.04217392951250076, "rewards/progression_diversity/mean": -0.0426679290831089, "rewards/progression_diversity/std": 0.08528563380241394, "rewards/symbolic_reward_accuracy/mean": 0.27734375, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.7399088740348816, "rewards/symbolic_reward_partial_score/std": 0.21892520785331726, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9704339504241943, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 54.93540573120117, "step": 2601 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2412198781967163, "epoch": 4.169871794871795, "grad_norm": 26214.958984375, "learning_rate": 1e-06, "loss": 2.6327, "step": 2602 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.29308077692985535, "epoch": 4.171474358974359, "grad_norm": 19403.22265625, "learning_rate": 1e-06, "loss": 0.3375, "step": 2603 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2676732540130615, "epoch": 4.173076923076923, "grad_norm": 0.013621392659842968, "learning_rate": 1e-06, "loss": 0.1484, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.20703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3097.0, "completions/mean_length": 4899.4921875, "completions/mean_terminated_length": 1901.0738525390625, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "entropy": 0.2361399084329605, "epoch": 4.174679487179487, "frac_reward_zero_std": 0.03125, "grad_norm": 1089.3848876953125, "learning_rate": 1e-06, "loss": 0.1781, "num_tokens": 1519098398.0, "reward": 0.32966169714927673, "reward_std": 0.037758007645606995, "rewards/progression_diversity/mean": -0.039204493165016174, "rewards/progression_diversity/std": 0.07915779203176498, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.731689453125, "rewards/symbolic_reward_partial_score/std": 0.20694516599178314, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9641684889793396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 58.79726028442383, "step": 2605 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.29851216077804565, "epoch": 4.176282051282051, "grad_norm": 0.012859205715358257, "learning_rate": 1e-06, "loss": 0.0952, "step": 2606 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.26364219933748245, "epoch": 4.177884615384615, "grad_norm": 0.01124588306993246, "learning_rate": 1e-06, "loss": 0.1605, "step": 2607 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.25536324083805084, "epoch": 4.17948717948718, "grad_norm": 0.014565329998731613, "learning_rate": 1e-06, "loss": 0.1756, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.263671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3179.0, "completions/mean_length": 5711.21484375, "completions/mean_terminated_length": 1889.3951416015625, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "entropy": 0.2375631034374237, "epoch": 4.181089743589744, "frac_reward_zero_std": 0.0, "grad_norm": 1342.1802978515625, "learning_rate": 1e-06, "loss": 0.1399, "num_tokens": 1522799932.0, "reward": 0.375043660402298, "reward_std": 0.06779703497886658, "rewards/progression_diversity/mean": -0.05374106019735336, "rewards/progression_diversity/std": 0.09237094968557358, "rewards/symbolic_reward_accuracy/mean": 0.265625, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.7239420413970947, "rewards/symbolic_reward_partial_score/std": 0.2347642481327057, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.932857871055603, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 76.66647338867188, "step": 2609 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2648901790380478, "epoch": 4.1826923076923075, "grad_norm": 23.981592178344727, "learning_rate": 1e-06, "loss": 0.1384, "step": 2610 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.2275170087814331, "epoch": 4.184294871794872, "grad_norm": 0.01362849585711956, "learning_rate": 1e-06, "loss": 0.2444, "step": 2611 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.21743150800466537, "epoch": 4.185897435897436, "grad_norm": 0.012125054374337196, "learning_rate": 1e-06, "loss": 0.2923, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 5987.09375, "completions/mean_terminated_length": 1918.7391357421875, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "entropy": 0.24884148687124252, "epoch": 4.1875, "frac_reward_zero_std": 0.0625, "grad_norm": 962.3455810546875, "learning_rate": 1e-06, "loss": 0.1161, "num_tokens": 1526650092.0, "reward": 0.39309823513031006, "reward_std": 0.0952787920832634, "rewards/progression_diversity/mean": -0.05785566568374634, "rewards/progression_diversity/std": 0.09512782096862793, "rewards/symbolic_reward_accuracy/mean": 0.279296875, "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, "rewards/symbolic_reward_partial_score/mean": 0.759521484375, "rewards/symbolic_reward_partial_score/std": 0.2274795025587082, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.934838056564331, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 76.90158081054688, "step": 2613 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.19819628447294235, "epoch": 4.189102564102564, "grad_norm": 79746.578125, "learning_rate": 1e-06, "loss": 0.2315, "step": 2614 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.23601322621107101, "epoch": 4.190705128205128, "grad_norm": 0.25907084345817566, "learning_rate": 1e-06, "loss": 0.1821, "step": 2615 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.23588327318429947, "epoch": 4.1923076923076925, "grad_norm": 0.07316934317350388, "learning_rate": 1e-06, "loss": 0.1802, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.279296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 5958.806640625, "completions/mean_terminated_length": 1918.6910400390625, "completions/min_length": 1163.0, "completions/min_terminated_length": 1163.0, "entropy": 0.21116046607494354, "epoch": 4.193910256410256, "frac_reward_zero_std": 0.0, "grad_norm": 1958.6832275390625, "learning_rate": 1e-06, "loss": 0.1101, "num_tokens": 1530549113.0, "reward": 0.3267575204372406, "reward_std": 0.08475440740585327, "rewards/progression_diversity/mean": -0.06009019538760185, "rewards/progression_diversity/std": 0.09868360310792923, "rewards/symbolic_reward_accuracy/mean": 0.193359375, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.7077311277389526, "rewards/symbolic_reward_partial_score/std": 0.24380625784397125, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9315183758735657, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 78.36473083496094, "step": 2617 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.22109153121709824, "epoch": 4.19551282051282, "grad_norm": 3.5450334548950195, "learning_rate": 1e-06, "loss": 0.2224, "step": 2618 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.24128204584121704, "epoch": 4.197115384615385, "grad_norm": 0.01579132489860058, "learning_rate": 1e-06, "loss": 0.1629, "step": 2619 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.22772854566574097, "epoch": 4.198717948717949, "grad_norm": 0.02087291143834591, "learning_rate": 1e-06, "loss": 0.0817, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.279296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4516.0, "completions/mean_length": 5997.45703125, "completions/mean_terminated_length": 1972.31982421875, "completions/min_length": 1246.0, "completions/min_terminated_length": 1246.0, "entropy": 0.2100074663758278, "epoch": 4.200320512820513, "frac_reward_zero_std": 0.0, "grad_norm": 1239.959228515625, "learning_rate": 1e-06, "loss": 0.1074, "num_tokens": 1534550627.0, "reward": 0.26383617520332336, "reward_std": 0.05979030951857567, "rewards/progression_diversity/mean": -0.06218412518501282, "rewards/progression_diversity/std": 0.1025165319442749, "rewards/symbolic_reward_accuracy/mean": 0.103515625, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.6784017086029053, "rewards/symbolic_reward_partial_score/std": 0.21352458000183105, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9165331125259399, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 87.43069458007812, "step": 2621 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.24080143868923187, "epoch": 4.201923076923077, "grad_norm": 2.7573001384735107, "learning_rate": 1e-06, "loss": 0.1249, "step": 2622 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.21514707058668137, "epoch": 4.203525641025641, "grad_norm": 0.024495212361216545, "learning_rate": 1e-06, "loss": 0.1772, "step": 2623 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.20372025668621063, "epoch": 4.205128205128205, "grad_norm": 0.013811548240482807, "learning_rate": 1e-06, "loss": 0.2065, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3565.0, "completions/mean_length": 6081.05078125, "completions/mean_terminated_length": 1971.131103515625, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "entropy": 0.21161595731973648, "epoch": 4.206730769230769, "frac_reward_zero_std": 0.0, "grad_norm": 856.1641845703125, "learning_rate": 1e-06, "loss": 0.154, "num_tokens": 1538566221.0, "reward": 0.3104988634586334, "reward_std": 0.025025444105267525, "rewards/progression_diversity/mean": -0.06339757144451141, "rewards/progression_diversity/std": 0.10305111110210419, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.7278645634651184, "rewards/symbolic_reward_partial_score/std": 0.1853560358285904, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9161036014556885, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 87.99066925048828, "step": 2625 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.23871337622404099, "epoch": 4.208333333333333, "grad_norm": 36.576744079589844, "learning_rate": 1e-06, "loss": 0.1355, "step": 2626 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.21872205287218094, "epoch": 4.209935897435898, "grad_norm": 27.23736000061035, "learning_rate": 1e-06, "loss": 0.1091, "step": 2627 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.1995127573609352, "epoch": 4.211538461538462, "grad_norm": 0.022180048748850822, "learning_rate": 1e-06, "loss": 0.2113, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.310546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4055.0, "completions/mean_length": 6442.0234375, "completions/mean_terminated_length": 1963.9093017578125, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "entropy": 0.20760582387447357, "epoch": 4.2131410256410255, "frac_reward_zero_std": 0.0, "grad_norm": 3401.139404296875, "learning_rate": 1e-06, "loss": 0.1055, "num_tokens": 1542709657.0, "reward": 0.2375459372997284, "reward_std": 0.05149232968688011, "rewards/progression_diversity/mean": -0.07206659764051437, "rewards/progression_diversity/std": 0.11053474247455597, "rewards/symbolic_reward_accuracy/mean": 0.052734375, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.6946126222610474, "rewards/symbolic_reward_partial_score/std": 0.21639835834503174, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9067912697792053, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 92.28494262695312, "step": 2629 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.1819758117198944, "epoch": 4.214743589743589, "grad_norm": 1448.0584716796875, "learning_rate": 1e-06, "loss": 0.3297, "step": 2630 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.22094269841909409, "epoch": 4.216346153846154, "grad_norm": 0.1524176448583603, "learning_rate": 1e-06, "loss": 0.1131, "step": 2631 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.21019763499498367, "epoch": 4.217948717948718, "grad_norm": 0.015752186998724937, "learning_rate": 1e-06, "loss": 0.1985, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.330078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 6724.53125, "completions/mean_terminated_length": 1965.2012939453125, "completions/min_length": 1115.0, "completions/min_terminated_length": 1115.0, "entropy": 0.19603969156742096, "epoch": 4.219551282051282, "frac_reward_zero_std": 0.0, "grad_norm": 1366.21484375, "learning_rate": 1e-06, "loss": 0.1466, "num_tokens": 1547020169.0, "reward": 0.35085177421569824, "reward_std": 0.07054072618484497, "rewards/progression_diversity/mean": -0.07302698493003845, "rewards/progression_diversity/std": 0.10713056474924088, "rewards/symbolic_reward_accuracy/mean": 0.232421875, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.7110025882720947, "rewards/symbolic_reward_partial_score/std": 0.22401832044124603, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.910980224609375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 90.13082122802734, "step": 2633 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.19943545758724213, "epoch": 4.221153846153846, "grad_norm": 25.857837677001953, "learning_rate": 1e-06, "loss": 0.1302, "step": 2634 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.21115753799676895, "epoch": 4.222756410256411, "grad_norm": 0.014513122849166393, "learning_rate": 1e-06, "loss": 0.1383, "step": 2635 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.1924782246351242, "epoch": 4.2243589743589745, "grad_norm": 0.024281147867441177, "learning_rate": 1e-06, "loss": 0.1726, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.255859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3878.0, "completions/mean_length": 5678.255859375, "completions/mean_terminated_length": 1997.2781982421875, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "entropy": 0.21415793150663376, "epoch": 4.225961538461538, "frac_reward_zero_std": 0.0, "grad_norm": 1601.75390625, "learning_rate": 1e-06, "loss": 0.0806, "num_tokens": 1550778716.0, "reward": 0.34982889890670776, "reward_std": 0.06202153116464615, "rewards/progression_diversity/mean": -0.05031656473875046, "rewards/progression_diversity/std": 0.0884041115641594, "rewards/symbolic_reward_accuracy/mean": 0.232421875, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.7055338621139526, "rewards/symbolic_reward_partial_score/std": 0.22613964974880219, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9257657527923584, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 81.70069885253906, "step": 2637 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.22891773283481598, "epoch": 4.227564102564102, "grad_norm": 2865.813232421875, "learning_rate": 1e-06, "loss": 0.0946, "step": 2638 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.21290214359760284, "epoch": 4.229166666666667, "grad_norm": 9.31295394897461, "learning_rate": 1e-06, "loss": 0.194, "step": 2639 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2276032641530037, "epoch": 4.230769230769231, "grad_norm": 863.3584594726562, "learning_rate": 1e-06, "loss": 0.2034, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4074.0, "completions/mean_length": 6243.08203125, "completions/mean_terminated_length": 1961.3612060546875, "completions/min_length": 1208.0, "completions/min_terminated_length": 1208.0, "entropy": 0.18027716875076294, "epoch": 4.232371794871795, "frac_reward_zero_std": 0.0, "grad_norm": 1586.920654296875, "learning_rate": 1e-06, "loss": 0.1933, "num_tokens": 1554829782.0, "reward": 0.5201883912086487, "reward_std": 0.06066072732210159, "rewards/progression_diversity/mean": -0.05831076577305794, "rewards/progression_diversity/std": 0.09286186844110489, "rewards/symbolic_reward_accuracy/mean": 0.453125, "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, "rewards/symbolic_reward_partial_score/mean": 0.8316080570220947, "rewards/symbolic_reward_partial_score/std": 0.2016787976026535, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9149261116981506, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 86.6026611328125, "step": 2641 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2154352068901062, "epoch": 4.233974358974359, "grad_norm": 0.13046927750110626, "learning_rate": 1e-06, "loss": 0.114, "step": 2642 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.20094606280326843, "epoch": 4.235576923076923, "grad_norm": 0.010223207995295525, "learning_rate": 1e-06, "loss": 0.1366, "step": 2643 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.20492058992385864, "epoch": 4.237179487179487, "grad_norm": 0.01739051379263401, "learning_rate": 1e-06, "loss": 0.156, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3361.0, "completions/mean_length": 5930.7109375, "completions/mean_terminated_length": 1996.6773681640625, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "entropy": 0.21672135591506958, "epoch": 4.238782051282051, "frac_reward_zero_std": 0.03125, "grad_norm": 1050.203125, "learning_rate": 1e-06, "loss": 0.1747, "num_tokens": 1558679986.0, "reward": 0.4322790503501892, "reward_std": 0.07013079524040222, "rewards/progression_diversity/mean": -0.05236773565411568, "rewards/progression_diversity/std": 0.08915333449840546, "rewards/symbolic_reward_accuracy/mean": 0.35546875, "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, "rewards/symbolic_reward_partial_score/mean": 0.7395508289337158, "rewards/symbolic_reward_partial_score/std": 0.2565183639526367, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9261719584465027, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 78.85722351074219, "step": 2645 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.2054838389158249, "epoch": 4.240384615384615, "grad_norm": 109.77615356445312, "learning_rate": 1e-06, "loss": 0.1925, "step": 2646 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.24229851365089417, "epoch": 4.24198717948718, "grad_norm": 0.40434131026268005, "learning_rate": 1e-06, "loss": 0.1406, "step": 2647 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.21324530988931656, "epoch": 4.243589743589744, "grad_norm": 1434.3251953125, "learning_rate": 1e-06, "loss": 0.2993, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.287109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 6136.265625, "completions/mean_terminated_length": 2009.095947265625, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "entropy": 0.20269504189491272, "epoch": 4.2451923076923075, "frac_reward_zero_std": 0.0, "grad_norm": 778.6556396484375, "learning_rate": 1e-06, "loss": 0.1087, "num_tokens": 1562723466.0, "reward": 0.3345257341861725, "reward_std": 0.0690338984131813, "rewards/progression_diversity/mean": -0.054263561964035034, "rewards/progression_diversity/std": 0.08993817865848541, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.6839518547058105, "rewards/symbolic_reward_partial_score/std": 0.21551425755023956, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9316227436065674, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 74.91038513183594, "step": 2649 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.359375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.197452612221241, "epoch": 4.246794871794872, "grad_norm": 7070.578125, "learning_rate": 1e-06, "loss": 0.9423, "step": 2650 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.21947723627090454, "epoch": 4.248397435897436, "grad_norm": 1.046020746231079, "learning_rate": 1e-06, "loss": 0.2003, "step": 2651 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.21606357395648956, "epoch": 4.25, "grad_norm": 0.011754968203604221, "learning_rate": 1e-06, "loss": 0.1609, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4284.0, "completions/mean_length": 5343.4765625, "completions/mean_terminated_length": 2036.923828125, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "entropy": 0.2657729983329773, "epoch": 4.251602564102564, "frac_reward_zero_std": 0.0, "grad_norm": 1515.29736328125, "learning_rate": 1e-06, "loss": 0.0791, "num_tokens": 1566282654.0, "reward": 0.28956276178359985, "reward_std": 0.050334710627794266, "rewards/progression_diversity/mean": -0.04128318279981613, "rewards/progression_diversity/std": 0.08098018169403076, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.7276529669761658, "rewards/symbolic_reward_partial_score/std": 0.1922309845685959, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9493874907493591, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 63.77783203125, "step": 2653 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2197081595659256, "epoch": 4.253205128205128, "grad_norm": 0.02242899127304554, "learning_rate": 1e-06, "loss": 0.192, "step": 2654 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.220329187810421, "epoch": 4.2548076923076925, "grad_norm": 0.013821829110383987, "learning_rate": 1e-06, "loss": 0.1641, "step": 2655 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.21346864849328995, "epoch": 4.256410256410256, "grad_norm": 0.020072348415851593, "learning_rate": 1e-06, "loss": 0.2551, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.279296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 6021.244140625, "completions/mean_terminated_length": 2005.3251953125, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "entropy": 0.2017608880996704, "epoch": 4.25801282051282, "frac_reward_zero_std": 0.0, "grad_norm": 1055.4764404296875, "learning_rate": 1e-06, "loss": 0.1218, "num_tokens": 1570270299.0, "reward": 0.3231794834136963, "reward_std": 0.05370699614286423, "rewards/progression_diversity/mean": -0.05509728938341141, "rewards/progression_diversity/std": 0.09356637299060822, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.7047525644302368, "rewards/symbolic_reward_partial_score/std": 0.21620500087738037, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.935107946395874, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 70.60637664794922, "step": 2657 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.23436833918094635, "epoch": 4.259615384615385, "grad_norm": 139.91903686523438, "learning_rate": 1e-06, "loss": 0.1392, "step": 2658 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.1886296272277832, "epoch": 4.261217948717949, "grad_norm": 23.106496810913086, "learning_rate": 1e-06, "loss": 0.2419, "step": 2659 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2260531634092331, "epoch": 4.262820512820513, "grad_norm": 0.19578726589679718, "learning_rate": 1e-06, "loss": 0.1691, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.287109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3542.0, "completions/mean_length": 6165.703125, "completions/mean_terminated_length": 2050.38916015625, "completions/min_length": 1040.0, "completions/min_terminated_length": 1040.0, "entropy": 0.19962295144796371, "epoch": 4.264423076923077, "frac_reward_zero_std": 0.0, "grad_norm": 7102.345703125, "learning_rate": 1e-06, "loss": 0.1947, "num_tokens": 1574379123.0, "reward": 0.31796449422836304, "reward_std": 0.04501751810312271, "rewards/progression_diversity/mean": -0.06097230315208435, "rewards/progression_diversity/std": 0.10001429170370102, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.7155599594116211, "rewards/symbolic_reward_partial_score/std": 0.19974127411842346, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9347620606422424, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 71.3311767578125, "step": 2661 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.20831771194934845, "epoch": 4.266025641025641, "grad_norm": 1.6806011199951172, "learning_rate": 1e-06, "loss": 0.2179, "step": 2662 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.2336948812007904, "epoch": 4.267628205128205, "grad_norm": 0.07298646867275238, "learning_rate": 1e-06, "loss": 0.1278, "step": 2663 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5234375, "entropy": 0.2085075080394745, "epoch": 4.269230769230769, "grad_norm": 0.7998039722442627, "learning_rate": 1e-06, "loss": 0.1634, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.259765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3341.0, "completions/mean_length": 5808.7734375, "completions/mean_terminated_length": 2097.67822265625, "completions/min_length": 1006.0, "completions/min_terminated_length": 1006.0, "entropy": 0.21773767471313477, "epoch": 4.270833333333333, "frac_reward_zero_std": 0.03125, "grad_norm": 1245.658203125, "learning_rate": 1e-06, "loss": 0.1845, "num_tokens": 1578336351.0, "reward": 0.32293805480003357, "reward_std": 0.05607787147164345, "rewards/progression_diversity/mean": -0.048967085778713226, "rewards/progression_diversity/std": 0.0883127897977829, "rewards/symbolic_reward_accuracy/mean": 0.173828125, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.7343424558639526, "rewards/symbolic_reward_partial_score/std": 0.213489830493927, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9438670873641968, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 65.19830322265625, "step": 2665 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.20395781844854355, "epoch": 4.272435897435898, "grad_norm": 1100.1546630859375, "learning_rate": 1e-06, "loss": 0.2518, "step": 2666 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.23760968446731567, "epoch": 4.274038461538462, "grad_norm": 0.022954348474740982, "learning_rate": 1e-06, "loss": 0.0944, "step": 2667 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.22392556816339493, "epoch": 4.2756410256410255, "grad_norm": 0.028759047389030457, "learning_rate": 1e-06, "loss": 0.1897, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.271484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4153.0, "completions/mean_length": 5969.759765625, "completions/mean_terminated_length": 2088.849853515625, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.22678442299365997, "epoch": 4.277243589743589, "frac_reward_zero_std": 0.0625, "grad_norm": 729.8278198242188, "learning_rate": 1e-06, "loss": 0.0574, "num_tokens": 1582333732.0, "reward": 0.2548202872276306, "reward_std": 0.04770761728286743, "rewards/progression_diversity/mean": -0.0428718663752079, "rewards/progression_diversity/std": 0.07706419378519058, "rewards/symbolic_reward_accuracy/mean": 0.107421875, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.6405435800552368, "rewards/symbolic_reward_partial_score/std": 0.21080487966537476, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9503331780433655, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 61.90028381347656, "step": 2669 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.20155585557222366, "epoch": 4.278846153846154, "grad_norm": 0.12458622455596924, "learning_rate": 1e-06, "loss": 0.253, "step": 2670 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.21851865947246552, "epoch": 4.280448717948718, "grad_norm": 0.17406442761421204, "learning_rate": 1e-06, "loss": 0.1624, "step": 2671 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2346644550561905, "epoch": 4.282051282051282, "grad_norm": 0.018945371732115746, "learning_rate": 1e-06, "loss": 0.1564, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 5541.857421875, "completions/mean_terminated_length": 2150.212890625, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.2370782345533371, "epoch": 4.283653846153846, "frac_reward_zero_std": 0.0, "grad_norm": 549.9456787109375, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 1585986235.0, "reward": 0.34320884943008423, "reward_std": 0.08159251511096954, "rewards/progression_diversity/mean": -0.03165600448846817, "rewards/progression_diversity/std": 0.06647796928882599, "rewards/symbolic_reward_accuracy/mean": 0.220703125, "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, "rewards/symbolic_reward_partial_score/mean": 0.7088867425918579, "rewards/symbolic_reward_partial_score/std": 0.2290610820055008, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9550345540046692, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 60.594451904296875, "step": 2673 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.21898599714040756, "epoch": 4.285256410256411, "grad_norm": 0.42992645502090454, "learning_rate": 1e-06, "loss": 0.2189, "step": 2674 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.24925994873046875, "epoch": 4.2868589743589745, "grad_norm": 0.020100675523281097, "learning_rate": 1e-06, "loss": 0.0946, "step": 2675 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.2393641099333763, "epoch": 4.288461538461538, "grad_norm": 0.01565798930823803, "learning_rate": 1e-06, "loss": 0.2389, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4642.0, "completions/mean_length": 5307.583984375, "completions/mean_terminated_length": 2276.723876953125, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "entropy": 0.2343580722808838, "epoch": 4.290064102564102, "frac_reward_zero_std": 0.0, "grad_norm": 1280.0311279296875, "learning_rate": 1e-06, "loss": 0.0584, "num_tokens": 1589516614.0, "reward": 0.3476376533508301, "reward_std": 0.125152587890625, "rewards/progression_diversity/mean": -0.03457668796181679, "rewards/progression_diversity/std": 0.08347929269075394, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.7035644054412842, "rewards/symbolic_reward_partial_score/std": 0.2703304886817932, "rewards/tag_count_reward/mean": -0.048828125, "rewards/tag_count_reward/std": 0.2157193273305893, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9533030986785889, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 62.77830505371094, "step": 2677 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.26719367504119873, "epoch": 4.291666666666667, "grad_norm": 0.26546433568000793, "learning_rate": 1e-06, "loss": 0.1209, "step": 2678 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2331203892827034, "epoch": 4.293269230769231, "grad_norm": 0.01867266371846199, "learning_rate": 1e-06, "loss": 0.2064, "step": 2679 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.23465164005756378, "epoch": 4.294871794871795, "grad_norm": 0.015481102280318737, "learning_rate": 1e-06, "loss": 0.1809, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4966.0, "completions/mean_length": 6191.328125, "completions/mean_terminated_length": 2355.376220703125, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "entropy": 0.18589715659618378, "epoch": 4.296474358974359, "frac_reward_zero_std": 0.0, "grad_norm": 1452.932373046875, "learning_rate": 1e-06, "loss": 0.1798, "num_tokens": 1593491662.0, "reward": 0.35412442684173584, "reward_std": 0.1382516324520111, "rewards/progression_diversity/mean": -0.046052753925323486, "rewards/progression_diversity/std": 0.09779871255159378, "rewards/symbolic_reward_accuracy/mean": 0.2578125, "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, "rewards/symbolic_reward_partial_score/mean": 0.6865071654319763, "rewards/symbolic_reward_partial_score/std": 0.29961350560188293, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.935286283493042, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 69.72281646728516, "step": 2681 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.21021146327257156, "epoch": 4.298076923076923, "grad_norm": 0.04195324704051018, "learning_rate": 1e-06, "loss": 0.1036, "step": 2682 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.20793236792087555, "epoch": 4.299679487179487, "grad_norm": 0.15370796620845795, "learning_rate": 1e-06, "loss": 0.1581, "step": 2683 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.19368456304073334, "epoch": 4.301282051282051, "grad_norm": 0.04016382619738579, "learning_rate": 1e-06, "loss": 0.2206, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4625.0, "completions/mean_length": 5578.798828125, "completions/mean_terminated_length": 2271.084228515625, "completions/min_length": 590.0, "completions/min_terminated_length": 590.0, "entropy": 0.21986397355794907, "epoch": 4.302884615384615, "frac_reward_zero_std": 0.0, "grad_norm": 847.1763305664062, "learning_rate": 1e-06, "loss": 0.1179, "num_tokens": 1597246327.0, "reward": 0.2654598355293274, "reward_std": 0.09309060871601105, "rewards/progression_diversity/mean": -0.039955541491508484, "rewards/progression_diversity/std": 0.08839154243469238, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.6368489265441895, "rewards/symbolic_reward_partial_score/std": 0.2666816711425781, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.944598376750946, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 68.18759155273438, "step": 2685 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.20748194307088852, "epoch": 4.30448717948718, "grad_norm": 1317.310302734375, "learning_rate": 1e-06, "loss": 0.4452, "step": 2686 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23975148051977158, "epoch": 4.306089743589744, "grad_norm": 0.02449853904545307, "learning_rate": 1e-06, "loss": 0.0999, "step": 2687 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2513374388217926, "epoch": 4.3076923076923075, "grad_norm": 0.0537121407687664, "learning_rate": 1e-06, "loss": 0.1342, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.173828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4082.0, "completions/mean_length": 4746.345703125, "completions/mean_terminated_length": 2297.76123046875, "completions/min_length": 1121.0, "completions/min_terminated_length": 1121.0, "entropy": 0.264044925570488, "epoch": 4.309294871794872, "frac_reward_zero_std": 0.03125, "grad_norm": 532.7546997070312, "learning_rate": 1e-06, "loss": 0.1721, "num_tokens": 1600521912.0, "reward": 0.3210293650627136, "reward_std": 0.10824176669120789, "rewards/progression_diversity/mean": -0.02890094369649887, "rewards/progression_diversity/std": 0.08233704417943954, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.7064778804779053, "rewards/symbolic_reward_partial_score/std": 0.26733410358428955, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.965470552444458, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 58.45082473754883, "step": 2689 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.390625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.27652691304683685, "epoch": 4.310897435897436, "grad_norm": 2549.2978515625, "learning_rate": 1e-06, "loss": 0.0809, "step": 2690 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.23626501858234406, "epoch": 4.3125, "grad_norm": 25.48175048828125, "learning_rate": 1e-06, "loss": 0.1749, "step": 2691 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.26965607702732086, "epoch": 4.314102564102564, "grad_norm": 21.068880081176758, "learning_rate": 1e-06, "loss": 0.158, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.173828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4066.0, "completions/mean_length": 4708.7734375, "completions/mean_terminated_length": 2252.28369140625, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "entropy": 0.2753334045410156, "epoch": 4.315705128205128, "frac_reward_zero_std": 0.09375, "grad_norm": 2231.534423828125, "learning_rate": 1e-06, "loss": 0.1243, "num_tokens": 1603895476.0, "reward": 0.3245422840118408, "reward_std": 0.10428034514188766, "rewards/progression_diversity/mean": -0.028682753443717957, "rewards/progression_diversity/std": 0.07225596159696579, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.662841796875, "rewards/symbolic_reward_partial_score/std": 0.27776244282722473, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9683549404144287, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 57.83184051513672, "step": 2693 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2380198836326599, "epoch": 4.3173076923076925, "grad_norm": 908.7568969726562, "learning_rate": 1e-06, "loss": 0.1637, "step": 2694 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2474098801612854, "epoch": 4.318910256410256, "grad_norm": 24.16575050354004, "learning_rate": 1e-06, "loss": 0.213, "step": 2695 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.28730684518814087, "epoch": 4.32051282051282, "grad_norm": 0.17024962604045868, "learning_rate": 1e-06, "loss": 0.027, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.17578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5872.0, "completions/mean_length": 4746.83203125, "completions/mean_terminated_length": 2264.9716796875, "completions/min_length": 1135.0, "completions/min_terminated_length": 1135.0, "entropy": 0.25719259679317474, "epoch": 4.322115384615385, "frac_reward_zero_std": 0.0, "grad_norm": 801.6245727539062, "learning_rate": 1e-06, "loss": 0.1019, "num_tokens": 1607156350.0, "reward": 0.30364054441452026, "reward_std": 0.11523503810167313, "rewards/progression_diversity/mean": -0.023154182359576225, "rewards/progression_diversity/std": 0.07085532695055008, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.668505847454071, "rewards/symbolic_reward_partial_score/std": 0.2690269947052002, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9642022848129272, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 60.5496826171875, "step": 2697 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.264162078499794, "epoch": 4.323717948717949, "grad_norm": 7159.71484375, "learning_rate": 1e-06, "loss": 0.5042, "step": 2698 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.2615188807249069, "epoch": 4.325320512820513, "grad_norm": 6776.10302734375, "learning_rate": 1e-06, "loss": 0.2205, "step": 2699 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2697322964668274, "epoch": 4.326923076923077, "grad_norm": 0.10481588542461395, "learning_rate": 1e-06, "loss": 0.1195, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.185546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3512.0, "completions/mean_length": 4892.396484375, "completions/mean_terminated_length": 2274.4052734375, "completions/min_length": 1158.0, "completions/min_terminated_length": 1158.0, "entropy": 0.27855899930000305, "epoch": 4.328525641025641, "frac_reward_zero_std": 0.0, "grad_norm": 877.5675659179688, "learning_rate": 1e-06, "loss": 0.0915, "num_tokens": 1610557369.0, "reward": 0.24516724050045013, "reward_std": 0.09491077810525894, "rewards/progression_diversity/mean": -0.03063952922821045, "rewards/progression_diversity/std": 0.08986818790435791, "rewards/symbolic_reward_accuracy/mean": 0.109375, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.6248860955238342, "rewards/symbolic_reward_partial_score/std": 0.26446953415870667, "rewards/tag_count_reward/mean": -0.076171875, "rewards/tag_count_reward/std": 0.26553234457969666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9703221321105957, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 56.01524353027344, "step": 2701 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2597070336341858, "epoch": 4.330128205128205, "grad_norm": 2198.372802734375, "learning_rate": 1e-06, "loss": 0.3091, "step": 2702 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.26145724952220917, "epoch": 4.331730769230769, "grad_norm": 1281.1771240234375, "learning_rate": 1e-06, "loss": 0.1863, "step": 2703 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2582560032606125, "epoch": 4.333333333333333, "grad_norm": 0.06832489371299744, "learning_rate": 1e-06, "loss": 0.2302, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4234.0, "completions/mean_length": 4208.84765625, "completions/mean_terminated_length": 2280.65625, "completions/min_length": 1218.0, "completions/min_terminated_length": 1218.0, "entropy": 0.2881985604763031, "epoch": 4.334935897435898, "frac_reward_zero_std": 0.03125, "grad_norm": 2649.09423828125, "learning_rate": 1e-06, "loss": 0.2109, "num_tokens": 1613462507.0, "reward": 0.3500940799713135, "reward_std": 0.11316157132387161, "rewards/progression_diversity/mean": -0.019399691373109818, "rewards/progression_diversity/std": 0.07821278274059296, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.6936686038970947, "rewards/symbolic_reward_partial_score/std": 0.2801879942417145, "rewards/tag_count_reward/mean": -0.0546875, "rewards/tag_count_reward/std": 0.2275916188955307, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9930620193481445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 43.674652099609375, "step": 2705 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.28035198152065277, "epoch": 4.336538461538462, "grad_norm": 6.077508449554443, "learning_rate": 1e-06, "loss": 0.1302, "step": 2706 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.29166266322135925, "epoch": 4.3381410256410255, "grad_norm": 0.6038011908531189, "learning_rate": 1e-06, "loss": 0.1481, "step": 2707 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.28930486738681793, "epoch": 4.339743589743589, "grad_norm": 0.5783957839012146, "learning_rate": 1e-06, "loss": 0.1213, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4111.0, "completions/mean_length": 4497.861328125, "completions/mean_terminated_length": 2296.724609375, "completions/min_length": 1329.0, "completions/min_terminated_length": 1329.0, "entropy": 0.2855447679758072, "epoch": 4.341346153846154, "frac_reward_zero_std": 0.0, "grad_norm": 770.3568725585938, "learning_rate": 1e-06, "loss": 0.1363, "num_tokens": 1616717908.0, "reward": 0.2790448069572449, "reward_std": 0.11913672089576721, "rewards/progression_diversity/mean": -0.018859868869185448, "rewards/progression_diversity/std": 0.06757510453462601, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.6872884035110474, "rewards/symbolic_reward_partial_score/std": 0.2816750705242157, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9902298450469971, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 45.65616989135742, "step": 2709 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.26931584626436234, "epoch": 4.342948717948718, "grad_norm": 456.98199462890625, "learning_rate": 1e-06, "loss": 0.2891, "step": 2710 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2959611713886261, "epoch": 4.344551282051282, "grad_norm": 6.0887250900268555, "learning_rate": 1e-06, "loss": 0.7324, "step": 2711 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2712084949016571, "epoch": 4.346153846153846, "grad_norm": 61.22941970825195, "learning_rate": 1e-06, "loss": 0.2727, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4333.0, "completions/mean_length": 4048.97265625, "completions/mean_terminated_length": 2286.825927734375, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "entropy": 0.2950481027364731, "epoch": 4.347756410256411, "frac_reward_zero_std": 0.03125, "grad_norm": 1410.4874267578125, "learning_rate": 1e-06, "loss": 0.1737, "num_tokens": 1619610758.0, "reward": 0.28769299387931824, "reward_std": 0.09948226064443588, "rewards/progression_diversity/mean": -0.012928030453622341, "rewards/progression_diversity/std": 0.05409675091505051, "rewards/symbolic_reward_accuracy/mean": 0.1640625, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.6508138179779053, "rewards/symbolic_reward_partial_score/std": 0.267758846282959, "rewards/tag_count_reward/mean": -0.05859375, "rewards/tag_count_reward/std": 0.23509246110916138, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0013058185577393, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 40.96929931640625, "step": 2713 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3671875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3074454814195633, "epoch": 4.3493589743589745, "grad_norm": 1392.9722900390625, "learning_rate": 1e-06, "loss": 0.1584, "step": 2714 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.28932228684425354, "epoch": 4.350961538461538, "grad_norm": 12.400731086730957, "learning_rate": 1e-06, "loss": 0.5763, "step": 2715 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.28287215530872345, "epoch": 4.352564102564102, "grad_norm": 488.6679382324219, "learning_rate": 1e-06, "loss": 0.2059, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.142578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4276.0, "completions/mean_length": 4305.140625, "completions/mean_terminated_length": 2296.583251953125, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "entropy": 0.2647574245929718, "epoch": 4.354166666666667, "frac_reward_zero_std": 0.0625, "grad_norm": 1532.6412353515625, "learning_rate": 1e-06, "loss": 0.247, "num_tokens": 1622678990.0, "reward": 0.26650041341781616, "reward_std": 0.10990884155035019, "rewards/progression_diversity/mean": -0.019881153479218483, "rewards/progression_diversity/std": 0.07622101157903671, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.6494140625, "rewards/symbolic_reward_partial_score/std": 0.2761240601539612, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9932198524475098, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 43.36308288574219, "step": 2717 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.30068182945251465, "epoch": 4.355769230769231, "grad_norm": 9.29902172088623, "learning_rate": 1e-06, "loss": 0.137, "step": 2718 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2581791281700134, "epoch": 4.357371794871795, "grad_norm": 1740.7247314453125, "learning_rate": 1e-06, "loss": 0.235, "step": 2719 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.29872237145900726, "epoch": 4.358974358974359, "grad_norm": 24.941686630249023, "learning_rate": 1e-06, "loss": 0.1233, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5111.0, "completions/mean_length": 4246.28515625, "completions/mean_terminated_length": 2292.140625, "completions/min_length": 1088.0, "completions/min_terminated_length": 1088.0, "entropy": 0.3010956943035126, "epoch": 4.360576923076923, "frac_reward_zero_std": 0.09375, "grad_norm": 2546.631103515625, "learning_rate": 1e-06, "loss": 0.0579, "num_tokens": 1625667760.0, "reward": 0.2679121792316437, "reward_std": 0.097173772752285, "rewards/progression_diversity/mean": -0.019818993285298347, "rewards/progression_diversity/std": 0.07718969136476517, "rewards/symbolic_reward_accuracy/mean": 0.140625, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.6358886957168579, "rewards/symbolic_reward_partial_score/std": 0.26983723044395447, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0032100677490234, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 38.43324279785156, "step": 2721 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2788269519805908, "epoch": 4.362179487179487, "grad_norm": 0.07314433157444, "learning_rate": 1e-06, "loss": 0.1801, "step": 2722 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2793916165828705, "epoch": 4.363782051282051, "grad_norm": 0.012875649146735668, "learning_rate": 1e-06, "loss": 0.179, "step": 2723 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2723848670721054, "epoch": 4.365384615384615, "grad_norm": 0.053210750222206116, "learning_rate": 1e-06, "loss": 0.2149, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.177734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5363.0, "completions/mean_length": 4823.73046875, "completions/mean_terminated_length": 2324.955078125, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "entropy": 0.26607632637023926, "epoch": 4.36698717948718, "frac_reward_zero_std": 0.03125, "grad_norm": 536.0072631835938, "learning_rate": 1e-06, "loss": 0.1797, "num_tokens": 1629215734.0, "reward": 0.19067470729351044, "reward_std": 0.1059282124042511, "rewards/progression_diversity/mean": -0.025302495807409286, "rewards/progression_diversity/std": 0.08640986680984497, "rewards/symbolic_reward_accuracy/mean": 0.060546875, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.5459309816360474, "rewards/symbolic_reward_partial_score/std": 0.26790010929107666, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9844135046005249, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 45.332759857177734, "step": 2725 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2631705701351166, "epoch": 4.368589743589744, "grad_norm": 1414.080810546875, "learning_rate": 1e-06, "loss": 0.3588, "step": 2726 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.23933739215135574, "epoch": 4.3701923076923075, "grad_norm": 841.8580932617188, "learning_rate": 1e-06, "loss": 0.3975, "step": 2727 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2838355749845505, "epoch": 4.371794871794872, "grad_norm": 46.018558502197266, "learning_rate": 1e-06, "loss": 0.133, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5066.0, "completions/mean_length": 4252.61328125, "completions/mean_terminated_length": 2331.35302734375, "completions/min_length": 1241.0, "completions/min_terminated_length": 1241.0, "entropy": 0.2777034640312195, "epoch": 4.373397435897436, "frac_reward_zero_std": 0.03125, "grad_norm": 674.3890991210938, "learning_rate": 1e-06, "loss": 0.1737, "num_tokens": 1632154656.0, "reward": 0.3058478832244873, "reward_std": 0.1194763332605362, "rewards/progression_diversity/mean": -0.017753848806023598, "rewards/progression_diversity/std": 0.07137567549943924, "rewards/symbolic_reward_accuracy/mean": 0.197265625, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.6541991829872131, "rewards/symbolic_reward_partial_score/std": 0.2857528626918793, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.001778483390808, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 38.360496520996094, "step": 2729 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3052513748407364, "epoch": 4.375, "grad_norm": 153.75807189941406, "learning_rate": 1e-06, "loss": 0.0824, "step": 2730 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2919474095106125, "epoch": 4.376602564102564, "grad_norm": 0.023059062659740448, "learning_rate": 1e-06, "loss": 0.2305, "step": 2731 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.29401521384716034, "epoch": 4.378205128205128, "grad_norm": 0.014374534599483013, "learning_rate": 1e-06, "loss": 0.2279, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4636.0, "completions/mean_length": 4243.36328125, "completions/mean_terminated_length": 2320.63818359375, "completions/min_length": 1151.0, "completions/min_terminated_length": 1151.0, "entropy": 0.30612772703170776, "epoch": 4.3798076923076925, "frac_reward_zero_std": 0.0625, "grad_norm": 1426.8148193359375, "learning_rate": 1e-06, "loss": 0.1158, "num_tokens": 1635162922.0, "reward": 0.23149944841861725, "reward_std": 0.09988817572593689, "rewards/progression_diversity/mean": -0.018513914197683334, "rewards/progression_diversity/std": 0.07170087099075317, "rewards/symbolic_reward_accuracy/mean": 0.1015625, "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, "rewards/symbolic_reward_partial_score/mean": 0.5978027582168579, "rewards/symbolic_reward_partial_score/std": 0.2723853290081024, "rewards/tag_count_reward/mean": -0.0859375, "rewards/tag_count_reward/std": 0.28054583072662354, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0040245056152344, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 38.19234848022461, "step": 2733 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.277388796210289, "epoch": 4.381410256410256, "grad_norm": 0.028441831469535828, "learning_rate": 1e-06, "loss": 0.2186, "step": 2734 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.29677003622055054, "epoch": 4.38301282051282, "grad_norm": 0.014668729156255722, "learning_rate": 1e-06, "loss": 0.117, "step": 2735 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.28144995868206024, "epoch": 4.384615384615385, "grad_norm": 0.014569677412509918, "learning_rate": 1e-06, "loss": 0.219, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.130859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3800.0, "completions/mean_length": 4181.333984375, "completions/mean_terminated_length": 2344.07861328125, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "entropy": 0.277862548828125, "epoch": 4.386217948717949, "frac_reward_zero_std": 0.03125, "grad_norm": 822.4926147460938, "learning_rate": 1e-06, "loss": 0.2072, "num_tokens": 1638169845.0, "reward": 0.3534255623817444, "reward_std": 0.12979258596897125, "rewards/progression_diversity/mean": -0.015356136485934258, "rewards/progression_diversity/std": 0.0585295706987381, "rewards/symbolic_reward_accuracy/mean": 0.255859375, "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, "rewards/symbolic_reward_partial_score/mean": 0.6857584714889526, "rewards/symbolic_reward_partial_score/std": 0.2829958498477936, "rewards/tag_count_reward/mean": -0.056640625, "rewards/tag_count_reward/std": 0.23138070106506348, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9974150657653809, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 42.795005798339844, "step": 2737 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3079565465450287, "epoch": 4.387820512820513, "grad_norm": 3624.53125, "learning_rate": 1e-06, "loss": 0.4683, "step": 2738 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2956803888082504, "epoch": 4.389423076923077, "grad_norm": 605.93310546875, "learning_rate": 1e-06, "loss": 0.216, "step": 2739 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.2977532297372818, "epoch": 4.391025641025641, "grad_norm": 0.01604938693344593, "learning_rate": 1e-06, "loss": 0.1553, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3997.0, "completions/mean_length": 3936.32421875, "completions/mean_terminated_length": 2315.103759765625, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "entropy": 0.288834884762764, "epoch": 4.392628205128205, "frac_reward_zero_std": 0.09375, "grad_norm": 13779.6708984375, "learning_rate": 1e-06, "loss": 0.2399, "num_tokens": 1641167563.0, "reward": 0.19894778728485107, "reward_std": 0.08240419626235962, "rewards/progression_diversity/mean": -0.016354724764823914, "rewards/progression_diversity/std": 0.06330784410238266, "rewards/symbolic_reward_accuracy/mean": 0.056640625, "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, "rewards/symbolic_reward_partial_score/mean": 0.5764648914337158, "rewards/symbolic_reward_partial_score/std": 0.26688218116760254, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.010166883468628, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 37.03101348876953, "step": 2741 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3093618303537369, "epoch": 4.394230769230769, "grad_norm": 40587.7109375, "learning_rate": 1e-06, "loss": 2.4703, "step": 2742 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.31586118042469025, "epoch": 4.395833333333333, "grad_norm": 1233050.0, "learning_rate": 1e-06, "loss": 117.5929, "step": 2743 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.29217708110809326, "epoch": 4.397435897435898, "grad_norm": 0.014232386834919453, "learning_rate": 1e-06, "loss": 0.1765, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4179.0, "completions/mean_length": 3859.9765625, "completions/mean_terminated_length": 2352.708984375, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.2802654653787613, "epoch": 4.399038461538462, "frac_reward_zero_std": 0.09375, "grad_norm": 1434.6417236328125, "learning_rate": 1e-06, "loss": 0.2646, "num_tokens": 1644092767.0, "reward": 0.2843579947948456, "reward_std": 0.11489598453044891, "rewards/progression_diversity/mean": -0.01488535013049841, "rewards/progression_diversity/std": 0.06615662574768066, "rewards/symbolic_reward_accuracy/mean": 0.166015625, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.6384602785110474, "rewards/symbolic_reward_partial_score/std": 0.2716268301010132, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0159142017364502, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 34.84583282470703, "step": 2745 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3136250674724579, "epoch": 4.4006410256410255, "grad_norm": 4538.53564453125, "learning_rate": 1e-06, "loss": 0.6472, "step": 2746 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3199605792760849, "epoch": 4.402243589743589, "grad_norm": 3800.058349609375, "learning_rate": 1e-06, "loss": 0.4546, "step": 2747 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.31357628107070923, "epoch": 4.403846153846154, "grad_norm": 11993.4921875, "learning_rate": 1e-06, "loss": 1.4673, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.126953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4939.0, "completions/mean_length": 4187.60546875, "completions/mean_terminated_length": 2414.08056640625, "completions/min_length": 1315.0, "completions/min_terminated_length": 1315.0, "entropy": 0.30130642652511597, "epoch": 4.405448717948718, "frac_reward_zero_std": 0.0625, "grad_norm": 1137.6595458984375, "learning_rate": 1e-06, "loss": 0.075, "num_tokens": 1647127509.0, "reward": 0.23044368624687195, "reward_std": 0.11951728165149689, "rewards/progression_diversity/mean": -0.023503467440605164, "rewards/progression_diversity/std": 0.08980035781860352, "rewards/symbolic_reward_accuracy/mean": 0.09765625, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.597705066204071, "rewards/symbolic_reward_partial_score/std": 0.25887858867645264, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0048857927322388, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 40.13702392578125, "step": 2749 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.29961445927619934, "epoch": 4.407051282051282, "grad_norm": 0.05410754680633545, "learning_rate": 1e-06, "loss": 0.1237, "step": 2750 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3096155524253845, "epoch": 4.408653846153846, "grad_norm": 0.017144974321126938, "learning_rate": 1e-06, "loss": 0.1487, "step": 2751 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2930636405944824, "epoch": 4.410256410256411, "grad_norm": 0.6245795488357544, "learning_rate": 1e-06, "loss": 0.184, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4303.0, "completions/mean_length": 3234.2734375, "completions/mean_terminated_length": 2328.342529296875, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "entropy": 0.3245136886835098, "epoch": 4.4118589743589745, "frac_reward_zero_std": 0.0625, "grad_norm": 1545.5665283203125, "learning_rate": 1e-06, "loss": 0.1395, "num_tokens": 1649663425.0, "reward": 0.3059009313583374, "reward_std": 0.08847295492887497, "rewards/progression_diversity/mean": -0.014401247724890709, "rewards/progression_diversity/std": 0.06953856348991394, "rewards/symbolic_reward_accuracy/mean": 0.171875, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.6913737058639526, "rewards/symbolic_reward_partial_score/std": 0.24967017769813538, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0390673875808716, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 25.484981536865234, "step": 2753 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.31169527769088745, "epoch": 4.413461538461538, "grad_norm": 0.02673480287194252, "learning_rate": 1e-06, "loss": 0.1752, "step": 2754 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3431106209754944, "epoch": 4.415064102564102, "grad_norm": 0.0156412310898304, "learning_rate": 1e-06, "loss": 0.0603, "step": 2755 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3379392623901367, "epoch": 4.416666666666667, "grad_norm": 0.03630439192056656, "learning_rate": 1e-06, "loss": 0.0386, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4258.0, "completions/mean_length": 2953.6015625, "completions/mean_terminated_length": 2205.929931640625, "completions/min_length": 1181.0, "completions/min_terminated_length": 1181.0, "entropy": 0.33529841899871826, "epoch": 4.418269230769231, "frac_reward_zero_std": 0.09375, "grad_norm": 1840.3985595703125, "learning_rate": 1e-06, "loss": 0.0718, "num_tokens": 1652110821.0, "reward": 0.27117669582366943, "reward_std": 0.089139424264431, "rewards/progression_diversity/mean": -0.014164266176521778, "rewards/progression_diversity/std": 0.07661490887403488, "rewards/symbolic_reward_accuracy/mean": 0.12109375, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.6706705689430237, "rewards/symbolic_reward_partial_score/std": 0.23558108508586884, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038825273513794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 25.822206497192383, "step": 2757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3324403911828995, "epoch": 4.419871794871795, "grad_norm": 0.03022000938653946, "learning_rate": 1e-06, "loss": 0.0187, "step": 2758 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.32936082780361176, "epoch": 4.421474358974359, "grad_norm": 0.026491805911064148, "learning_rate": 1e-06, "loss": 0.1138, "step": 2759 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3215688169002533, "epoch": 4.423076923076923, "grad_norm": 0.019565416499972343, "learning_rate": 1e-06, "loss": 0.1158, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4371.0, "completions/mean_length": 2875.587890625, "completions/mean_terminated_length": 2211.23974609375, "completions/min_length": 1132.0, "completions/min_terminated_length": 1132.0, "entropy": 0.34050194919109344, "epoch": 4.424679487179487, "frac_reward_zero_std": 0.15625, "grad_norm": 462.5588684082031, "learning_rate": 1e-06, "loss": 0.0736, "num_tokens": 1654529730.0, "reward": 0.3495715260505676, "reward_std": 0.09474267810583115, "rewards/progression_diversity/mean": -0.010134847834706306, "rewards/progression_diversity/std": 0.061041031032800674, "rewards/symbolic_reward_accuracy/mean": 0.22265625, "rewards/symbolic_reward_accuracy/std": 0.41643625497817993, "rewards/symbolic_reward_partial_score/mean": 0.7293782830238342, "rewards/symbolic_reward_partial_score/std": 0.21722783148288727, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0413200855255127, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 24.018436431884766, "step": 2761 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3244868367910385, "epoch": 4.426282051282051, "grad_norm": 0.03473278507590294, "learning_rate": 1e-06, "loss": 0.0975, "step": 2762 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3279813975095749, "epoch": 4.427884615384615, "grad_norm": 0.02159891277551651, "learning_rate": 1e-06, "loss": 0.0595, "step": 2763 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.33927109837532043, "epoch": 4.42948717948718, "grad_norm": 0.017984116449952126, "learning_rate": 1e-06, "loss": 0.0433, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3670.0, "completions/mean_length": 2952.734375, "completions/mean_terminated_length": 2116.763671875, "completions/min_length": 1072.0, "completions/min_terminated_length": 1072.0, "entropy": 0.3449130952358246, "epoch": 4.431089743589744, "frac_reward_zero_std": 0.1875, "grad_norm": 0.01726151816546917, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 1656940938.0, "reward": 0.3437863886356354, "reward_std": 0.07033580541610718, "rewards/progression_diversity/mean": -0.011987379752099514, "rewards/progression_diversity/std": 0.059770114719867706, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.7179687023162842, "rewards/symbolic_reward_partial_score/std": 0.22868217527866364, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.029953956604004, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 30.53244400024414, "step": 2765 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.33045631647109985, "epoch": 4.4326923076923075, "grad_norm": 1069.1544189453125, "learning_rate": 1e-06, "loss": 0.1185, "step": 2766 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3347640484571457, "epoch": 4.434294871794872, "grad_norm": 0.23746109008789062, "learning_rate": 1e-06, "loss": 0.0323, "step": 2767 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3243960440158844, "epoch": 4.435897435897436, "grad_norm": 0.02076563611626625, "learning_rate": 1e-06, "loss": 0.1661, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.072265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3358.0, "completions/mean_length": 3165.107421875, "completions/mean_terminated_length": 2135.425048828125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3194960504770279, "epoch": 4.4375, "frac_reward_zero_std": 0.1875, "grad_norm": 1015.4884643554688, "learning_rate": 1e-06, "loss": 0.1141, "num_tokens": 1659435105.0, "reward": 0.3932766914367676, "reward_std": 0.10632849484682083, "rewards/progression_diversity/mean": -0.015594206750392914, "rewards/progression_diversity/std": 0.06812360882759094, "rewards/symbolic_reward_accuracy/mean": 0.283203125, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.7548013925552368, "rewards/symbolic_reward_partial_score/std": 0.23602750897407532, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0133789777755737, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 38.01353073120117, "step": 2769 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3396587520837784, "epoch": 4.439102564102564, "grad_norm": 615.4129638671875, "learning_rate": 1e-06, "loss": 0.0409, "step": 2770 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3090101033449173, "epoch": 4.440705128205128, "grad_norm": 1137.1129150390625, "learning_rate": 1e-06, "loss": 0.1933, "step": 2771 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3358742445707321, "epoch": 4.4423076923076925, "grad_norm": 4.300602912902832, "learning_rate": 1e-06, "loss": 0.1023, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3278.0, "completions/mean_length": 3125.611328125, "completions/mean_terminated_length": 2122.876220703125, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "entropy": 0.3427940756082535, "epoch": 4.443910256410256, "frac_reward_zero_std": 0.125, "grad_norm": 407.4725341796875, "learning_rate": 1e-06, "loss": 0.0307, "num_tokens": 1661810682.0, "reward": 0.35953429341316223, "reward_std": 0.1048397421836853, "rewards/progression_diversity/mean": -0.011415429413318634, "rewards/progression_diversity/std": 0.053802087903022766, "rewards/symbolic_reward_accuracy/mean": 0.25, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.7059895396232605, "rewards/symbolic_reward_partial_score/std": 0.25418156385421753, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.019977331161499, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 34.17755889892578, "step": 2773 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3142138421535492, "epoch": 4.44551282051282, "grad_norm": 6694.47998046875, "learning_rate": 1e-06, "loss": 0.4636, "step": 2774 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3414735943078995, "epoch": 4.447115384615385, "grad_norm": 879.877685546875, "learning_rate": 1e-06, "loss": 0.0436, "step": 2775 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.330064132809639, "epoch": 4.448717948717949, "grad_norm": 0.05605228990316391, "learning_rate": 1e-06, "loss": 0.0995, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4448.0, "completions/mean_length": 3544.04296875, "completions/mean_terminated_length": 2123.570556640625, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "entropy": 0.3042691648006439, "epoch": 4.450320512820513, "frac_reward_zero_std": 0.03125, "grad_norm": 936.56103515625, "learning_rate": 1e-06, "loss": 0.1016, "num_tokens": 1664474448.0, "reward": 0.36131882667541504, "reward_std": 0.11240831017494202, "rewards/progression_diversity/mean": -0.013626575469970703, "rewards/progression_diversity/std": 0.05417032539844513, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.7146159410476685, "rewards/symbolic_reward_partial_score/std": 0.23360253870487213, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9984034299850464, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 43.34300994873047, "step": 2777 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.31309716403484344, "epoch": 4.451923076923077, "grad_norm": 12874.2177734375, "learning_rate": 1e-06, "loss": 0.5435, "step": 2778 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3106722980737686, "epoch": 4.453525641025641, "grad_norm": 0.4278506338596344, "learning_rate": 1e-06, "loss": 0.0563, "step": 2779 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.307477205991745, "epoch": 4.455128205128205, "grad_norm": 0.03869732841849327, "learning_rate": 1e-06, "loss": 0.0845, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3998.0, "completions/mean_length": 2993.6328125, "completions/mean_terminated_length": 2100.94189453125, "completions/min_length": 1022.0, "completions/min_terminated_length": 1022.0, "entropy": 0.33481790125370026, "epoch": 4.456730769230769, "frac_reward_zero_std": 0.0625, "grad_norm": 260.7884826660156, "learning_rate": 1e-06, "loss": 0.0412, "num_tokens": 1666869284.0, "reward": 0.3834180235862732, "reward_std": 0.1236167624592781, "rewards/progression_diversity/mean": -0.010737746953964233, "rewards/progression_diversity/std": 0.04949859157204628, "rewards/symbolic_reward_accuracy/mean": 0.267578125, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.7504231929779053, "rewards/symbolic_reward_partial_score/std": 0.22556151449680328, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0300805568695068, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 27.265310287475586, "step": 2781 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3359375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.31526750326156616, "epoch": 4.458333333333333, "grad_norm": 7211.5888671875, "learning_rate": 1e-06, "loss": 0.8517, "step": 2782 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.329397588968277, "epoch": 4.459935897435898, "grad_norm": 52.46617126464844, "learning_rate": 1e-06, "loss": 0.0521, "step": 2783 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.329983115196228, "epoch": 4.461538461538462, "grad_norm": 2546.868896484375, "learning_rate": 1e-06, "loss": 0.2009, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.041015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3777.0, "completions/mean_length": 2726.880859375, "completions/mean_terminated_length": 2142.767822265625, "completions/min_length": 1010.0, "completions/min_terminated_length": 1010.0, "entropy": 0.32919782400131226, "epoch": 4.4631410256410255, "frac_reward_zero_std": 0.15625, "grad_norm": 794.0968017578125, "learning_rate": 1e-06, "loss": 0.0914, "num_tokens": 1669120647.0, "reward": 0.3136161267757416, "reward_std": 0.07102187722921371, "rewards/progression_diversity/mean": -0.006063465960323811, "rewards/progression_diversity/std": 0.034248657524585724, "rewards/symbolic_reward_accuracy/mean": 0.17578125, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.7024902105331421, "rewards/symbolic_reward_partial_score/std": 0.2275681048631668, "rewards/tag_count_reward/mean": -0.025390625, "rewards/tag_count_reward/std": 0.15746226906776428, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.046392560005188, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 19.699295043945312, "step": 2785 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3417544662952423, "epoch": 4.464743589743589, "grad_norm": 46041.80078125, "learning_rate": 1e-06, "loss": 0.8061, "step": 2786 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.35066042840480804, "epoch": 4.466346153846154, "grad_norm": 0.02314118668437004, "learning_rate": 1e-06, "loss": 0.0297, "step": 2787 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.34087491035461426, "epoch": 4.467948717948718, "grad_norm": 5573.73779296875, "learning_rate": 1e-06, "loss": 0.1796, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3409.0, "completions/mean_length": 2916.076171875, "completions/mean_terminated_length": 2195.56982421875, "completions/min_length": 1118.0, "completions/min_terminated_length": 1118.0, "entropy": 0.34819819033145905, "epoch": 4.469551282051282, "frac_reward_zero_std": 0.28125, "grad_norm": 49.569732666015625, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 1671554350.0, "reward": 0.4041130542755127, "reward_std": 0.06181896850466728, "rewards/progression_diversity/mean": -0.005198919214308262, "rewards/progression_diversity/std": 0.02956199087202549, "rewards/symbolic_reward_accuracy/mean": 0.3046875, "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, "rewards/symbolic_reward_partial_score/mean": 0.7482584714889526, "rewards/symbolic_reward_partial_score/std": 0.23627658188343048, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.037619948387146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 22.79637908935547, "step": 2789 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3270893096923828, "epoch": 4.471153846153846, "grad_norm": 3314.6259765625, "learning_rate": 1e-06, "loss": 4.353, "step": 2790 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.31063470244407654, "epoch": 4.472756410256411, "grad_norm": 5541.47216796875, "learning_rate": 1e-06, "loss": 0.4331, "step": 2791 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3323570489883423, "epoch": 4.4743589743589745, "grad_norm": 0.013452508486807346, "learning_rate": 1e-06, "loss": 0.0806, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4269.0, "completions/mean_length": 2823.076171875, "completions/mean_terminated_length": 2185.2412109375, "completions/min_length": 677.0, "completions/min_terminated_length": 677.0, "entropy": 0.35660889744758606, "epoch": 4.475961538461538, "frac_reward_zero_std": 0.3125, "grad_norm": 459.7804260253906, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 1673850565.0, "reward": 0.3343041241168976, "reward_std": 0.0574406236410141, "rewards/progression_diversity/mean": -0.001719512976706028, "rewards/progression_diversity/std": 0.017452143132686615, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.6970865726470947, "rewards/symbolic_reward_partial_score/std": 0.22637033462524414, "rewards/tag_count_reward/mean": -0.013671875, "rewards/tag_count_reward/std": 0.1162383034825325, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0488998889923096, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 17.94207000732422, "step": 2793 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.340995192527771, "epoch": 4.477564102564102, "grad_norm": 0.02110283449292183, "learning_rate": 1e-06, "loss": 0.0215, "step": 2794 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.331098347902298, "epoch": 4.479166666666667, "grad_norm": 0.03017670288681984, "learning_rate": 1e-06, "loss": 0.0531, "step": 2795 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.33193665742874146, "epoch": 4.480769230769231, "grad_norm": 0.07421034574508667, "learning_rate": 1e-06, "loss": 0.0802, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4440.0, "completions/mean_length": 2845.490234375, "completions/mean_terminated_length": 2179.661865234375, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "entropy": 0.334924578666687, "epoch": 4.482371794871795, "frac_reward_zero_std": 0.375, "grad_norm": 300.2858581542969, "learning_rate": 1e-06, "loss": 0.0756, "num_tokens": 1676160880.0, "reward": 0.3624279201030731, "reward_std": 0.06554090976715088, "rewards/progression_diversity/mean": -0.0008624562760815024, "rewards/progression_diversity/std": 0.009423964656889439, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.7289550304412842, "rewards/symbolic_reward_partial_score/std": 0.23089763522148132, "rewards/tag_count_reward/mean": -0.02734375, "rewards/tag_count_reward/std": 0.16324250400066376, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0452535152435303, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 16.53045654296875, "step": 2797 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3460453897714615, "epoch": 4.483974358974359, "grad_norm": 0.013277917169034481, "learning_rate": 1e-06, "loss": 0.0197, "step": 2798 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3399341404438019, "epoch": 4.485576923076923, "grad_norm": 0.018666263669729233, "learning_rate": 1e-06, "loss": 0.078, "step": 2799 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.32549184560775757, "epoch": 4.487179487179487, "grad_norm": 0.0136836227029562, "learning_rate": 1e-06, "loss": 0.1256, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3747.0, "completions/mean_length": 2939.9921875, "completions/mean_terminated_length": 2162.239501953125, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "entropy": 0.3398710787296295, "epoch": 4.488782051282051, "frac_reward_zero_std": 0.34375, "grad_norm": 727.4561157226562, "learning_rate": 1e-06, "loss": 0.0565, "num_tokens": 1678543564.0, "reward": 0.2560884952545166, "reward_std": 0.05889653041958809, "rewards/progression_diversity/mean": -3.998234387836419e-05, "rewards/progression_diversity/std": 0.0009046972263604403, "rewards/symbolic_reward_accuracy/mean": 0.115234375, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.6348795294761658, "rewards/symbolic_reward_partial_score/std": 0.23259754478931427, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0552442073822021, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 7.984652519226074, "step": 2801 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.33726924657821655, "epoch": 4.490384615384615, "grad_norm": 0.024799533188343048, "learning_rate": 1e-06, "loss": 0.0782, "step": 2802 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3415806442499161, "epoch": 4.49198717948718, "grad_norm": 0.01905086636543274, "learning_rate": 1e-06, "loss": 0.05, "step": 2803 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.32189974188804626, "epoch": 4.493589743589744, "grad_norm": 0.01600506529211998, "learning_rate": 1e-06, "loss": 0.1482, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4592.0, "completions/mean_length": 2884.654296875, "completions/mean_terminated_length": 2191.66943359375, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "entropy": 0.33240219950675964, "epoch": 4.4951923076923075, "frac_reward_zero_std": 0.3125, "grad_norm": 1222.6678466796875, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 1680954971.0, "reward": 0.2994934320449829, "reward_std": 0.05192206799983978, "rewards/progression_diversity/mean": -0.00036605086643248796, "rewards/progression_diversity/std": 0.004469391889870167, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.7053548097610474, "rewards/symbolic_reward_partial_score/std": 0.2314625382423401, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0502550601959229, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 13.556268692016602, "step": 2805 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.32200056314468384, "epoch": 4.496794871794872, "grad_norm": 0.03439924120903015, "learning_rate": 1e-06, "loss": 0.1092, "step": 2806 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.34106314182281494, "epoch": 4.498397435897436, "grad_norm": 0.014802143909037113, "learning_rate": 1e-06, "loss": 0.0375, "step": 2807 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.32713207602500916, "epoch": 4.5, "grad_norm": 0.030605483800172806, "learning_rate": 1e-06, "loss": 0.1174, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 2988.58203125, "completions/mean_terminated_length": 2154.842529296875, "completions/min_length": 983.0, "completions/min_terminated_length": 983.0, "entropy": 0.32437823712825775, "epoch": 4.501602564102564, "frac_reward_zero_std": 0.1875, "grad_norm": 1198.33984375, "learning_rate": 1e-06, "loss": 0.0856, "num_tokens": 1683353045.0, "reward": 0.3443542718887329, "reward_std": 0.09359234571456909, "rewards/progression_diversity/mean": -0.0006082241889089346, "rewards/progression_diversity/std": 0.005995499901473522, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.692138671875, "rewards/symbolic_reward_partial_score/std": 0.2482290267944336, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.045188069343567, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 16.6503849029541, "step": 2809 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.33782175183296204, "epoch": 4.503205128205128, "grad_norm": 0.020003831014037132, "learning_rate": 1e-06, "loss": 0.0686, "step": 2810 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3277200609445572, "epoch": 4.5048076923076925, "grad_norm": 0.016141105443239212, "learning_rate": 1e-06, "loss": 0.0929, "step": 2811 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.32373303174972534, "epoch": 4.506410256410256, "grad_norm": 0.03574444353580475, "learning_rate": 1e-06, "loss": 0.1154, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4322.0, "completions/mean_length": 2614.474609375, "completions/mean_terminated_length": 2112.7509765625, "completions/min_length": 655.0, "completions/min_terminated_length": 655.0, "entropy": 0.3422088921070099, "epoch": 4.50801282051282, "frac_reward_zero_std": 0.375, "grad_norm": 256.3056335449219, "learning_rate": 1e-06, "loss": 0.057, "num_tokens": 1685530120.0, "reward": 0.3261813223361969, "reward_std": 0.06060642749071121, "rewards/progression_diversity/mean": -0.0005208561196923256, "rewards/progression_diversity/std": 0.005044504068791866, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.7187988758087158, "rewards/symbolic_reward_partial_score/std": 0.2249433696269989, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0556640625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 14.019671440124512, "step": 2813 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.32429981231689453, "epoch": 4.509615384615385, "grad_norm": 0.016235308721661568, "learning_rate": 1e-06, "loss": 0.1237, "step": 2814 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.34095147252082825, "epoch": 4.511217948717949, "grad_norm": 0.026136431843042374, "learning_rate": 1e-06, "loss": 0.0271, "step": 2815 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.34518367052078247, "epoch": 4.512820512820513, "grad_norm": 0.012866949662566185, "learning_rate": 1e-06, "loss": 0.0377, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3575.0, "completions/mean_length": 2740.462890625, "completions/mean_terminated_length": 2069.46923828125, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "entropy": 0.33681435883045197, "epoch": 4.514423076923077, "frac_reward_zero_std": 0.3125, "grad_norm": 524.9415283203125, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 1687806261.0, "reward": 0.4213888347148895, "reward_std": 0.08116651326417923, "rewards/progression_diversity/mean": -0.0007689363555982709, "rewards/progression_diversity/std": 0.008490425534546375, "rewards/symbolic_reward_accuracy/mean": 0.337890625, "rewards/symbolic_reward_accuracy/std": 0.4734536409378052, "rewards/symbolic_reward_partial_score/mean": 0.7386393547058105, "rewards/symbolic_reward_partial_score/std": 0.24396677315235138, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0432395935058594, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 18.25098419189453, "step": 2817 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3194080591201782, "epoch": 4.516025641025641, "grad_norm": 0.022730786353349686, "learning_rate": 1e-06, "loss": 0.1341, "step": 2818 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3404367119073868, "epoch": 4.517628205128205, "grad_norm": 0.017860587686300278, "learning_rate": 1e-06, "loss": 0.0195, "step": 2819 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.324150949716568, "epoch": 4.519230769230769, "grad_norm": 0.019054269418120384, "learning_rate": 1e-06, "loss": 0.0698, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3389.0, "completions/mean_length": 3260.208984375, "completions/mean_terminated_length": 2026.348388671875, "completions/min_length": 650.0, "completions/min_terminated_length": 650.0, "entropy": 0.3134499490261078, "epoch": 4.520833333333333, "frac_reward_zero_std": 0.1875, "grad_norm": 1172.327392578125, "learning_rate": 1e-06, "loss": 0.119, "num_tokens": 1690392976.0, "reward": 0.27747392654418945, "reward_std": 0.07828624546527863, "rewards/progression_diversity/mean": -0.0016331763472408056, "rewards/progression_diversity/std": 0.009813708253204823, "rewards/symbolic_reward_accuracy/mean": 0.140625, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.66455078125, "rewards/symbolic_reward_partial_score/std": 0.2490902692079544, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0194971561431885, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 25.523792266845703, "step": 2821 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.28017091751098633, "epoch": 4.522435897435898, "grad_norm": 0.024804720655083656, "learning_rate": 1e-06, "loss": 0.1724, "step": 2822 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.32716619968414307, "epoch": 4.524038461538462, "grad_norm": 0.02028745971620083, "learning_rate": 1e-06, "loss": 0.0362, "step": 2823 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.293626606464386, "epoch": 4.5256410256410255, "grad_norm": 0.01859315298497677, "learning_rate": 1e-06, "loss": 0.152, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4031.0, "completions/mean_length": 2742.6484375, "completions/mean_terminated_length": 2012.8641357421875, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "entropy": 0.33004169166088104, "epoch": 4.527243589743589, "frac_reward_zero_std": 0.375, "grad_norm": 179.26219177246094, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 1692678172.0, "reward": 0.3905848264694214, "reward_std": 0.056361980736255646, "rewards/progression_diversity/mean": -0.00011124266166007146, "rewards/progression_diversity/std": 0.002228989265859127, "rewards/symbolic_reward_accuracy/mean": 0.298828125, "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, "rewards/symbolic_reward_partial_score/mean": 0.716015636920929, "rewards/symbolic_reward_partial_score/std": 0.2630784511566162, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041427493095398, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 19.09334945678711, "step": 2825 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.310644194483757, "epoch": 4.528846153846154, "grad_norm": 0.01855999417603016, "learning_rate": 1e-06, "loss": 0.1514, "step": 2826 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.33532649278640747, "epoch": 4.530448717948718, "grad_norm": 0.017645422369241714, "learning_rate": 1e-06, "loss": 0.0642, "step": 2827 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3198052793741226, "epoch": 4.532051282051282, "grad_norm": 0.01399680133908987, "learning_rate": 1e-06, "loss": 0.0771, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.11328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4372.0, "completions/mean_length": 3640.142578125, "completions/mean_terminated_length": 2012.0726318359375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "entropy": 0.3003094792366028, "epoch": 4.533653846153846, "frac_reward_zero_std": 0.21875, "grad_norm": 824.9033813476562, "learning_rate": 1e-06, "loss": 0.0933, "num_tokens": 1695515621.0, "reward": 0.29925772547721863, "reward_std": 0.10430952906608582, "rewards/progression_diversity/mean": -1.0345164810132701e-05, "rewards/progression_diversity/std": 0.00019557196355890483, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.6570312976837158, "rewards/symbolic_reward_partial_score/std": 0.28810763359069824, "rewards/tag_count_reward/mean": -0.103515625, "rewards/tag_count_reward/std": 0.30492907762527466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0081380605697632, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 32.27723693847656, "step": 2829 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2708180397748947, "epoch": 4.535256410256411, "grad_norm": 0.02845774032175541, "learning_rate": 1e-06, "loss": 0.1828, "step": 2830 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.28906436264514923, "epoch": 4.5368589743589745, "grad_norm": 0.015066894702613354, "learning_rate": 1e-06, "loss": 0.1517, "step": 2831 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2919054627418518, "epoch": 4.538461538461538, "grad_norm": 0.027522152289748192, "learning_rate": 1e-06, "loss": 0.1443, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4253.0, "completions/mean_length": 3101.431640625, "completions/mean_terminated_length": 1975.790283203125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "entropy": 0.32231228053569794, "epoch": 4.540064102564102, "frac_reward_zero_std": 0.3125, "grad_norm": 864.8489990234375, "learning_rate": 1e-06, "loss": 0.0609, "num_tokens": 1697946786.0, "reward": 0.3166502118110657, "reward_std": 0.07383254915475845, "rewards/progression_diversity/mean": -1.783898551366292e-05, "rewards/progression_diversity/std": 0.0004036501923110336, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.6961263418197632, "rewards/symbolic_reward_partial_score/std": 0.26130229234695435, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0254395008087158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 25.37073516845703, "step": 2833 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.29585976898670197, "epoch": 4.541666666666667, "grad_norm": 0.015048340894281864, "learning_rate": 1e-06, "loss": 0.1864, "step": 2834 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.31169450283050537, "epoch": 4.543269230769231, "grad_norm": 0.013053800910711288, "learning_rate": 1e-06, "loss": 0.1106, "step": 2835 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3154265433549881, "epoch": 4.544871794871795, "grad_norm": 0.02868885174393654, "learning_rate": 1e-06, "loss": 0.0882, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4116.0, "completions/mean_length": 2923.740234375, "completions/mean_terminated_length": 1936.090087890625, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "entropy": 0.2929193377494812, "epoch": 4.546474358974359, "frac_reward_zero_std": 0.3125, "grad_norm": 1734.2320556640625, "learning_rate": 1e-06, "loss": 0.1872, "num_tokens": 1700301149.0, "reward": 0.2965981960296631, "reward_std": 0.07857708632946014, "rewards/progression_diversity/mean": -0.0008270339458249509, "rewards/progression_diversity/std": 0.01284003909677267, "rewards/symbolic_reward_accuracy/mean": 0.169921875, "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, "rewards/symbolic_reward_partial_score/mean": 0.669677734375, "rewards/symbolic_reward_partial_score/std": 0.2715960741043091, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0229657888412476, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 27.50617790222168, "step": 2837 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3156014531850815, "epoch": 4.548076923076923, "grad_norm": 0.010393503122031689, "learning_rate": 1e-06, "loss": 0.1186, "step": 2838 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.32449236512184143, "epoch": 4.549679487179487, "grad_norm": 0.015459039248526096, "learning_rate": 1e-06, "loss": 0.0538, "step": 2839 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.31201881170272827, "epoch": 4.551282051282051, "grad_norm": 0.014963453635573387, "learning_rate": 1e-06, "loss": 0.0831, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3925.0, "completions/mean_length": 3557.302734375, "completions/mean_terminated_length": 1950.44189453125, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "entropy": 0.30540113151073456, "epoch": 4.552884615384615, "frac_reward_zero_std": 0.09375, "grad_norm": 626.2611083984375, "learning_rate": 1e-06, "loss": 0.077, "num_tokens": 1703068392.0, "reward": 0.24340294301509857, "reward_std": 0.11287408322095871, "rewards/progression_diversity/mean": -4.04381935368292e-05, "rewards/progression_diversity/std": 0.0005320300115272403, "rewards/symbolic_reward_accuracy/mean": 0.1171875, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.6095215082168579, "rewards/symbolic_reward_partial_score/std": 0.27623260021209717, "rewards/tag_count_reward/mean": -0.09765625, "rewards/tag_count_reward/std": 0.29713961482048035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9910718202590942, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 40.45024108886719, "step": 2841 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2842327356338501, "epoch": 4.55448717948718, "grad_norm": 0.02323756366968155, "learning_rate": 1e-06, "loss": 0.1819, "step": 2842 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.28665095567703247, "epoch": 4.556089743589744, "grad_norm": 0.018670253455638885, "learning_rate": 1e-06, "loss": 0.1907, "step": 2843 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.28130021691322327, "epoch": 4.5576923076923075, "grad_norm": 0.014929386787116528, "learning_rate": 1e-06, "loss": 0.1567, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4265.0, "completions/mean_length": 2879.638671875, "completions/mean_terminated_length": 1888.7525634765625, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "entropy": 0.33259910345077515, "epoch": 4.559294871794872, "frac_reward_zero_std": 0.25, "grad_norm": 539.2830810546875, "learning_rate": 1e-06, "loss": 0.0357, "num_tokens": 1705342479.0, "reward": 0.309814453125, "reward_std": 0.08594641089439392, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.6902669668197632, "rewards/symbolic_reward_partial_score/std": 0.2536517083644867, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0254346132278442, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 25.892318725585938, "step": 2845 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3099818527698517, "epoch": 4.560897435897436, "grad_norm": 0.01687266118824482, "learning_rate": 1e-06, "loss": 0.1278, "step": 2846 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3287712037563324, "epoch": 4.5625, "grad_norm": 0.017019418999552727, "learning_rate": 1e-06, "loss": 0.0366, "step": 2847 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3039703518152237, "epoch": 4.564102564102564, "grad_norm": 0.02699456736445427, "learning_rate": 1e-06, "loss": 0.1679, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3132.0, "completions/mean_length": 3338.453125, "completions/mean_terminated_length": 1957.8228759765625, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "entropy": 0.3064081221818924, "epoch": 4.565705128205128, "frac_reward_zero_std": 0.34375, "grad_norm": 517.185302734375, "learning_rate": 1e-06, "loss": 0.109, "num_tokens": 1707884951.0, "reward": 0.35695183277130127, "reward_std": 0.07652982324361801, "rewards/progression_diversity/mean": -0.00013049774861428887, "rewards/progression_diversity/std": 0.002952827140688896, "rewards/symbolic_reward_accuracy/mean": 0.2421875, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.7328125238418579, "rewards/symbolic_reward_partial_score/std": 0.2740279734134674, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0161539316177368, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 28.80113983154297, "step": 2849 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3108035773038864, "epoch": 4.5673076923076925, "grad_norm": 4669.63525390625, "learning_rate": 1e-06, "loss": 0.3536, "step": 2850 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2923634201288223, "epoch": 4.568910256410256, "grad_norm": 0.014963356778025627, "learning_rate": 1e-06, "loss": 0.1355, "step": 2851 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3147512972354889, "epoch": 4.57051282051282, "grad_norm": 0.009707149118185043, "learning_rate": 1e-06, "loss": 0.1122, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.134765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4114.0, "completions/mean_length": 3972.37890625, "completions/mean_terminated_length": 2039.19189453125, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "entropy": 0.2673351615667343, "epoch": 4.572115384615385, "frac_reward_zero_std": 0.15625, "grad_norm": 499.8143005371094, "learning_rate": 1e-06, "loss": 0.1695, "num_tokens": 1710790361.0, "reward": 0.23631691932678223, "reward_std": 0.09914463013410568, "rewards/progression_diversity/mean": -0.0001452151482226327, "rewards/progression_diversity/std": 0.00327769061550498, "rewards/symbolic_reward_accuracy/mean": 0.103515625, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.6158528327941895, "rewards/symbolic_reward_partial_score/std": 0.28579404950141907, "rewards/tag_count_reward/mean": -0.10546875, "rewards/tag_count_reward/std": 0.3074568510055542, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9861880540847778, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 42.465789794921875, "step": 2853 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.301104798913002, "epoch": 4.573717948717949, "grad_norm": 0.01647542603313923, "learning_rate": 1e-06, "loss": 0.1265, "step": 2854 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2893615812063217, "epoch": 4.575320512820513, "grad_norm": 0.02017975226044655, "learning_rate": 1e-06, "loss": 0.1213, "step": 2855 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.26380933821201324, "epoch": 4.576923076923077, "grad_norm": 0.014531499706208706, "learning_rate": 1e-06, "loss": 0.2342, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.14453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3445.0, "completions/mean_length": 4089.650390625, "completions/mean_terminated_length": 2012.522705078125, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "entropy": 0.27838708460330963, "epoch": 4.578525641025641, "frac_reward_zero_std": 0.28125, "grad_norm": 3233.53173828125, "learning_rate": 1e-06, "loss": 0.1156, "num_tokens": 1713713190.0, "reward": 0.2200063169002533, "reward_std": 0.08358844369649887, "rewards/progression_diversity/mean": -0.0008350086864084005, "rewards/progression_diversity/std": 0.0046890368685126305, "rewards/symbolic_reward_accuracy/mean": 0.087890625, "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, "rewards/symbolic_reward_partial_score/mean": 0.5986166000366211, "rewards/symbolic_reward_partial_score/std": 0.27839624881744385, "rewards/tag_count_reward/mean": -0.123046875, "rewards/tag_count_reward/std": 0.32881227135658264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9900908470153809, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 42.56203842163086, "step": 2857 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.28409095108509064, "epoch": 4.580128205128205, "grad_norm": 359.2823486328125, "learning_rate": 1e-06, "loss": 0.1902, "step": 2858 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2833191752433777, "epoch": 4.581730769230769, "grad_norm": 0.015096834860742092, "learning_rate": 1e-06, "loss": 0.1434, "step": 2859 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2946213483810425, "epoch": 4.583333333333333, "grad_norm": 0.01923026703298092, "learning_rate": 1e-06, "loss": 0.16, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3878.0, "completions/mean_length": 4379.064453125, "completions/mean_terminated_length": 2022.95556640625, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "entropy": 0.282021164894104, "epoch": 4.584935897435898, "frac_reward_zero_std": 0.09375, "grad_norm": 1836.3408203125, "learning_rate": 1e-06, "loss": 0.1013, "num_tokens": 1716882167.0, "reward": 0.23553681373596191, "reward_std": 0.10817793756723404, "rewards/progression_diversity/mean": -0.0044245729222893715, "rewards/progression_diversity/std": 0.01807117834687233, "rewards/symbolic_reward_accuracy/mean": 0.103515625, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.6192545890808105, "rewards/symbolic_reward_partial_score/std": 0.2875533699989319, "rewards/tag_count_reward/mean": -0.123046875, "rewards/tag_count_reward/std": 0.32881227135658264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.978479266166687, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 46.914520263671875, "step": 2861 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2763599753379822, "epoch": 4.586538461538462, "grad_norm": 0.9805912971496582, "learning_rate": 1e-06, "loss": 0.1614, "step": 2862 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.27411825954914093, "epoch": 4.5881410256410255, "grad_norm": 0.016598911955952644, "learning_rate": 1e-06, "loss": 0.163, "step": 2863 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.26957446336746216, "epoch": 4.589743589743589, "grad_norm": 0.02522032894194126, "learning_rate": 1e-06, "loss": 0.2078, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3985.0, "completions/mean_length": 3941.201171875, "completions/mean_terminated_length": 1970.622314453125, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "entropy": 0.2950494736433029, "epoch": 4.591346153846154, "frac_reward_zero_std": 0.25, "grad_norm": 2163.189697265625, "learning_rate": 1e-06, "loss": 0.123, "num_tokens": 1719742142.0, "reward": 0.42619481682777405, "reward_std": 0.12798863649368286, "rewards/progression_diversity/mean": -0.001614722772501409, "rewards/progression_diversity/std": 0.008756079711019993, "rewards/symbolic_reward_accuracy/mean": 0.369140625, "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, "rewards/symbolic_reward_partial_score/mean": 0.7169270515441895, "rewards/symbolic_reward_partial_score/std": 0.3149210810661316, "rewards/tag_count_reward/mean": -0.103515625, "rewards/tag_count_reward/std": 0.30492907762527466, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9912563562393188, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 41.91343307495117, "step": 2865 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2725277543067932, "epoch": 4.592948717948718, "grad_norm": 5.392195224761963, "learning_rate": 1e-06, "loss": 0.1291, "step": 2866 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.29304346442222595, "epoch": 4.594551282051282, "grad_norm": 0.035934969782829285, "learning_rate": 1e-06, "loss": 0.1513, "step": 2867 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3078482747077942, "epoch": 4.596153846153846, "grad_norm": 0.01671292632818222, "learning_rate": 1e-06, "loss": 0.1524, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3282.0, "completions/mean_length": 3543.140625, "completions/mean_terminated_length": 1934.505615234375, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "entropy": 0.3241666555404663, "epoch": 4.597756410256411, "frac_reward_zero_std": 0.09375, "grad_norm": 679.676513671875, "learning_rate": 1e-06, "loss": 0.1035, "num_tokens": 1722401942.0, "reward": 0.302420049905777, "reward_std": 0.12250546365976334, "rewards/progression_diversity/mean": -0.0011605408508330584, "rewards/progression_diversity/std": 0.009744822047650814, "rewards/symbolic_reward_accuracy/mean": 0.189453125, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.656542956829071, "rewards/symbolic_reward_partial_score/std": 0.2855222225189209, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9899616241455078, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 41.70113754272461, "step": 2869 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2940093129873276, "epoch": 4.5993589743589745, "grad_norm": 135809.875, "learning_rate": 1e-06, "loss": 7.7034, "step": 2870 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3006378263235092, "epoch": 4.600961538461538, "grad_norm": 0.014255443587899208, "learning_rate": 1e-06, "loss": 0.122, "step": 2871 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.30779320001602173, "epoch": 4.602564102564102, "grad_norm": 0.01406958419829607, "learning_rate": 1e-06, "loss": 0.1774, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.138671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5307.0, "completions/mean_length": 3947.236328125, "completions/mean_terminated_length": 1944.945556640625, "completions/min_length": 608.0, "completions/min_terminated_length": 608.0, "entropy": 0.29377947747707367, "epoch": 4.604166666666667, "frac_reward_zero_std": 0.125, "grad_norm": 797.7410888671875, "learning_rate": 1e-06, "loss": 0.1212, "num_tokens": 1725344047.0, "reward": 0.32524576783180237, "reward_std": 0.12811732292175293, "rewards/progression_diversity/mean": -0.007164230570197105, "rewards/progression_diversity/std": 0.03267419710755348, "rewards/symbolic_reward_accuracy/mean": 0.2265625, "rewards/symbolic_reward_accuracy/std": 0.4190165400505066, "rewards/symbolic_reward_partial_score/mean": 0.6670736074447632, "rewards/symbolic_reward_partial_score/std": 0.28852909803390503, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9761419892311096, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 46.77880859375, "step": 2873 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.28681907057762146, "epoch": 4.605769230769231, "grad_norm": 0.03097737766802311, "learning_rate": 1e-06, "loss": 0.168, "step": 2874 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2869716286659241, "epoch": 4.607371794871795, "grad_norm": 0.017294280230998993, "learning_rate": 1e-06, "loss": 0.1486, "step": 2875 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.26380398869514465, "epoch": 4.608974358974359, "grad_norm": 0.011689919047057629, "learning_rate": 1e-06, "loss": 0.204, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4096.0, "completions/mean_length": 3391.904296875, "completions/mean_terminated_length": 1954.6009521484375, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "entropy": 0.31273353099823, "epoch": 4.610576923076923, "frac_reward_zero_std": 0.21875, "grad_norm": 2286.46435546875, "learning_rate": 1e-06, "loss": 0.144, "num_tokens": 1727923182.0, "reward": 0.346618115901947, "reward_std": 0.1120685264468193, "rewards/progression_diversity/mean": -0.011530693620443344, "rewards/progression_diversity/std": 0.04677894711494446, "rewards/symbolic_reward_accuracy/mean": 0.23828125, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.7052571773529053, "rewards/symbolic_reward_partial_score/std": 0.2821883261203766, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.005671739578247, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 34.7049560546875, "step": 2877 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3126651644706726, "epoch": 4.612179487179487, "grad_norm": 0.12625998258590698, "learning_rate": 1e-06, "loss": 0.0953, "step": 2878 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30597206950187683, "epoch": 4.613782051282051, "grad_norm": 0.013404784724116325, "learning_rate": 1e-06, "loss": 0.1176, "step": 2879 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2983405888080597, "epoch": 4.615384615384615, "grad_norm": 0.012157008983194828, "learning_rate": 1e-06, "loss": 0.1615, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3449.0, "completions/mean_length": 3446.08203125, "completions/mean_terminated_length": 1889.002197265625, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.2969917505979538, "epoch": 4.61698717948718, "frac_reward_zero_std": 0.125, "grad_norm": 808.1333618164062, "learning_rate": 1e-06, "loss": 0.1351, "num_tokens": 1730537336.0, "reward": 0.39082223176956177, "reward_std": 0.11836196482181549, "rewards/progression_diversity/mean": -0.011041311547160149, "rewards/progression_diversity/std": 0.04268914833664894, "rewards/symbolic_reward_accuracy/mean": 0.310546875, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.7113118767738342, "rewards/symbolic_reward_partial_score/std": 0.3017731308937073, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0123034715652466, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 34.338539123535156, "step": 2881 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3034718632698059, "epoch": 4.618589743589744, "grad_norm": 0.022329121828079224, "learning_rate": 1e-06, "loss": 0.1786, "step": 2882 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3132249712944031, "epoch": 4.6201923076923075, "grad_norm": 0.021133504807949066, "learning_rate": 1e-06, "loss": 0.1108, "step": 2883 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.31891536712646484, "epoch": 4.621794871794872, "grad_norm": 0.016432074829936028, "learning_rate": 1e-06, "loss": 0.0594, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3885.0, "completions/mean_length": 2795.095703125, "completions/mean_terminated_length": 1919.3035888671875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "entropy": 0.3364175856113434, "epoch": 4.623397435897436, "frac_reward_zero_std": 0.28125, "grad_norm": 1410.4578857421875, "learning_rate": 1e-06, "loss": 0.0957, "num_tokens": 1732852201.0, "reward": 0.34519851207733154, "reward_std": 0.0896041989326477, "rewards/progression_diversity/mean": -0.0035866908729076385, "rewards/progression_diversity/std": 0.021069582551717758, "rewards/symbolic_reward_accuracy/mean": 0.22265625, "rewards/symbolic_reward_accuracy/std": 0.41643625497817993, "rewards/symbolic_reward_partial_score/mean": 0.7230468988418579, "rewards/symbolic_reward_partial_score/std": 0.24687045812606812, "rewards/tag_count_reward/mean": -0.052734375, "rewards/tag_count_reward/std": 0.22372129559516907, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.028261423110962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 28.295589447021484, "step": 2885 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.32780230045318604, "epoch": 4.625, "grad_norm": 0.020152615383267403, "learning_rate": 1e-06, "loss": 0.0954, "step": 2886 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3289777636528015, "epoch": 4.626602564102564, "grad_norm": 0.012928970158100128, "learning_rate": 1e-06, "loss": 0.0939, "step": 2887 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.33029577136039734, "epoch": 4.628205128205128, "grad_norm": 0.013741690665483475, "learning_rate": 1e-06, "loss": 0.114, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2811.73828125, "completions/mean_terminated_length": 1966.9918212890625, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "entropy": 0.3604893386363983, "epoch": 4.6298076923076925, "frac_reward_zero_std": 0.25, "grad_norm": 329.1974792480469, "learning_rate": 1e-06, "loss": 0.0111, "num_tokens": 1735088787.0, "reward": 0.31703460216522217, "reward_std": 0.06931142508983612, "rewards/progression_diversity/mean": -0.0021051131188869476, "rewards/progression_diversity/std": 0.014493024908006191, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.7118000984191895, "rewards/symbolic_reward_partial_score/std": 0.24182109534740448, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0364668369293213, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 24.83597183227539, "step": 2889 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3358510881662369, "epoch": 4.631410256410256, "grad_norm": 0.026600396260619164, "learning_rate": 1e-06, "loss": 0.0782, "step": 2890 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.34051838517189026, "epoch": 4.63301282051282, "grad_norm": 0.01583019644021988, "learning_rate": 1e-06, "loss": 0.077, "step": 2891 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3082561045885086, "epoch": 4.634615384615385, "grad_norm": 0.013296614401042461, "learning_rate": 1e-06, "loss": 0.1433, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3677.0, "completions/mean_length": 3038.3359375, "completions/mean_terminated_length": 2059.09423828125, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "entropy": 0.33847953379154205, "epoch": 4.636217948717949, "frac_reward_zero_std": 0.34375, "grad_norm": 867.29736328125, "learning_rate": 1e-06, "loss": 0.0601, "num_tokens": 1737559423.0, "reward": 0.36477047204971313, "reward_std": 0.08899857103824615, "rewards/progression_diversity/mean": -0.001956797670572996, "rewards/progression_diversity/std": 0.012748402543365955, "rewards/symbolic_reward_accuracy/mean": 0.26171875, "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, "rewards/symbolic_reward_partial_score/mean": 0.7127115726470947, "rewards/symbolic_reward_partial_score/std": 0.27312228083610535, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.024355411529541, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 30.130250930786133, "step": 2893 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3239692151546478, "epoch": 4.637820512820513, "grad_norm": 0.012062937021255493, "learning_rate": 1e-06, "loss": 0.11, "step": 2894 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3260853737592697, "epoch": 4.639423076923077, "grad_norm": 0.11582206934690475, "learning_rate": 1e-06, "loss": 0.1047, "step": 2895 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31543244421482086, "epoch": 4.641025641025641, "grad_norm": 0.020565569400787354, "learning_rate": 1e-06, "loss": 0.1456, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3929.0, "completions/mean_length": 2900.99609375, "completions/mean_terminated_length": 2120.987548828125, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "entropy": 0.34089401364326477, "epoch": 4.642628205128205, "frac_reward_zero_std": 0.21875, "grad_norm": 597.7639770507812, "learning_rate": 1e-06, "loss": 0.0498, "num_tokens": 1739987869.0, "reward": 0.27188873291015625, "reward_std": 0.067967489361763, "rewards/progression_diversity/mean": -9.193217556457967e-05, "rewards/progression_diversity/std": 0.0012361678527668118, "rewards/symbolic_reward_accuracy/mean": 0.115234375, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.689501941204071, "rewards/symbolic_reward_partial_score/std": 0.22588656842708588, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0352983474731445, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 25.13549041748047, "step": 2897 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3242097795009613, "epoch": 4.644230769230769, "grad_norm": 0.021813800558447838, "learning_rate": 1e-06, "loss": 0.1245, "step": 2898 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.34044331312179565, "epoch": 4.645833333333333, "grad_norm": 0.016379984095692635, "learning_rate": 1e-06, "loss": 0.0982, "step": 2899 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.33183909952640533, "epoch": 4.647435897435898, "grad_norm": 0.04149186238646507, "learning_rate": 1e-06, "loss": 3.0971, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4292.0, "completions/mean_length": 2847.33203125, "completions/mean_terminated_length": 2064.218994140625, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "entropy": 0.32430991530418396, "epoch": 4.649038461538462, "frac_reward_zero_std": 0.21875, "grad_norm": 1954.2508544921875, "learning_rate": 1e-06, "loss": 0.1152, "num_tokens": 1742326855.0, "reward": 0.3111421465873718, "reward_std": 0.07776374369859695, "rewards/progression_diversity/mean": -4.3267915316391736e-05, "rewards/progression_diversity/std": 0.0009790410986170173, "rewards/symbolic_reward_accuracy/mean": 0.1875, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.6751627326011658, "rewards/symbolic_reward_partial_score/std": 0.2522996664047241, "rewards/tag_count_reward/mean": -0.0390625, "rewards/tag_count_reward/std": 0.1939331740140915, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0431500673294067, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 21.82623863220215, "step": 2901 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.34375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.3435054123401642, "epoch": 4.6506410256410255, "grad_norm": 0.013913768343627453, "learning_rate": 1e-06, "loss": 0.0597, "step": 2902 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.34366682171821594, "epoch": 4.652243589743589, "grad_norm": 0.021470922976732254, "learning_rate": 1e-06, "loss": 0.0525, "step": 2903 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3440418541431427, "epoch": 4.653846153846154, "grad_norm": 201.25079345703125, "learning_rate": 1e-06, "loss": 0.0948, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3945.0, "completions/mean_length": 2559.75390625, "completions/mean_terminated_length": 2026.9736328125, "completions/min_length": 664.0, "completions/min_terminated_length": 664.0, "entropy": 0.34741640090942383, "epoch": 4.655448717948718, "frac_reward_zero_std": 0.3125, "grad_norm": 587.72802734375, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 1744472953.0, "reward": 0.3951566517353058, "reward_std": 0.06519210338592529, "rewards/progression_diversity/mean": -0.0004501467919908464, "rewards/progression_diversity/std": 0.005880733020603657, "rewards/symbolic_reward_accuracy/mean": 0.283203125, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.7605631351470947, "rewards/symbolic_reward_partial_score/std": 0.23610848188400269, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0574297904968262, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 15.714860916137695, "step": 2905 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3476281762123108, "epoch": 4.657051282051282, "grad_norm": 0.025904379785060883, "learning_rate": 1e-06, "loss": 0.0288, "step": 2906 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3408808410167694, "epoch": 4.658653846153846, "grad_norm": 0.02337014675140381, "learning_rate": 1e-06, "loss": 0.0792, "step": 2907 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.33761781454086304, "epoch": 4.660256410256411, "grad_norm": 0.013979962095618248, "learning_rate": 1e-06, "loss": 0.0681, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 2804.318359375, "completions/mean_terminated_length": 2018.716796875, "completions/min_length": 513.0, "completions/min_terminated_length": 513.0, "entropy": 0.352565199136734, "epoch": 4.6618589743589745, "frac_reward_zero_std": 0.21875, "grad_norm": 0.027550842612981796, "learning_rate": 1e-06, "loss": 0.0123, "num_tokens": 1746766092.0, "reward": 0.33431804180145264, "reward_std": 0.07552418112754822, "rewards/progression_diversity/mean": -0.0008154752431437373, "rewards/progression_diversity/std": 0.007044443394988775, "rewards/symbolic_reward_accuracy/mean": 0.201171875, "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, "rewards/symbolic_reward_partial_score/mean": 0.7244465947151184, "rewards/symbolic_reward_partial_score/std": 0.24222518503665924, "rewards/tag_count_reward/mean": -0.037109375, "rewards/tag_count_reward/std": 0.18921469151973724, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0389173030853271, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 21.87183380126953, "step": 2909 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3289457857608795, "epoch": 4.663461538461538, "grad_norm": 0.018005970865488052, "learning_rate": 1e-06, "loss": 0.1036, "step": 2910 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31602902710437775, "epoch": 4.665064102564102, "grad_norm": 0.015265545807778835, "learning_rate": 1e-06, "loss": 0.0927, "step": 2911 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.32312436401844025, "epoch": 4.666666666666667, "grad_norm": 0.010948474518954754, "learning_rate": 1e-06, "loss": 0.1255, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 2489.4609375, "completions/mean_terminated_length": 1953.9715576171875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "entropy": 0.32907360792160034, "epoch": 4.668269230769231, "frac_reward_zero_std": 0.28125, "grad_norm": 1542.9832763671875, "learning_rate": 1e-06, "loss": 0.0546, "num_tokens": 1748830296.0, "reward": 0.3275552988052368, "reward_std": 0.04779002070426941, "rewards/progression_diversity/mean": -0.000332732277456671, "rewards/progression_diversity/std": 0.004427206236869097, "rewards/symbolic_reward_accuracy/mean": 0.1796875, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.7376953363418579, "rewards/symbolic_reward_partial_score/std": 0.21765783429145813, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0540077686309814, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 14.962639808654785, "step": 2913 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3347413241863251, "epoch": 4.669871794871795, "grad_norm": 0.02077837660908699, "learning_rate": 1e-06, "loss": 0.0577, "step": 2914 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.35084839165210724, "epoch": 4.671474358974359, "grad_norm": 0.1329037845134735, "learning_rate": 1e-06, "loss": 0.0075, "step": 2915 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3418993204832077, "epoch": 4.673076923076923, "grad_norm": 0.035508181899785995, "learning_rate": 1e-06, "loss": 0.0666, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.056640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3737.0, "completions/mean_length": 2839.4609375, "completions/mean_terminated_length": 2026.227783203125, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "entropy": 0.3173777014017105, "epoch": 4.674679487179487, "frac_reward_zero_std": 0.28125, "grad_norm": 1100.101806640625, "learning_rate": 1e-06, "loss": 0.1087, "num_tokens": 1751308340.0, "reward": 0.3337549567222595, "reward_std": 0.0646740198135376, "rewards/progression_diversity/mean": -0.000972743786405772, "rewards/progression_diversity/std": 0.006808849982917309, "rewards/symbolic_reward_accuracy/mean": 0.21875, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.690673828125, "rewards/symbolic_reward_partial_score/std": 0.2509380280971527, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.034561276435852, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 23.243541717529297, "step": 2917 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3356782793998718, "epoch": 4.676282051282051, "grad_norm": 0.01395330298691988, "learning_rate": 1e-06, "loss": 0.043, "step": 2918 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30980144441127777, "epoch": 4.677884615384615, "grad_norm": 0.015020109713077545, "learning_rate": 1e-06, "loss": 0.1587, "step": 2919 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3269190788269043, "epoch": 4.67948717948718, "grad_norm": 0.10749267786741257, "learning_rate": 1e-06, "loss": 0.0676, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.060546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3986.0, "completions/mean_length": 2943.259765625, "completions/mean_terminated_length": 2077.0166015625, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.3407384008169174, "epoch": 4.681089743589744, "frac_reward_zero_std": 0.3125, "grad_norm": 1545.7191162109375, "learning_rate": 1e-06, "loss": 0.0606, "num_tokens": 1753724633.0, "reward": 0.3297162652015686, "reward_std": 0.0669044628739357, "rewards/progression_diversity/mean": -0.0015185039956122637, "rewards/progression_diversity/std": 0.012093695811927319, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.6928547620773315, "rewards/symbolic_reward_partial_score/std": 0.24812039732933044, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0323323011398315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 27.282554626464844, "step": 2921 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.34692129492759705, "epoch": 4.6826923076923075, "grad_norm": 0.01900269091129303, "learning_rate": 1e-06, "loss": 0.0044, "step": 2922 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3168996274471283, "epoch": 4.684294871794872, "grad_norm": 0.011829662136733532, "learning_rate": 1e-06, "loss": 0.1236, "step": 2923 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3172369599342346, "epoch": 4.685897435897436, "grad_norm": 0.026483498513698578, "learning_rate": 1e-06, "loss": 0.1768, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.037109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3942.0, "completions/mean_length": 2581.607421875, "completions/mean_terminated_length": 2049.66943359375, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "entropy": 0.3523339778184891, "epoch": 4.6875, "frac_reward_zero_std": 0.28125, "grad_norm": 861.6783447265625, "learning_rate": 1e-06, "loss": 0.0552, "num_tokens": 1755836432.0, "reward": 0.2985629141330719, "reward_std": 0.05261341854929924, "rewards/progression_diversity/mean": -0.0006439232383854687, "rewards/progression_diversity/std": 0.005445914342999458, "rewards/symbolic_reward_accuracy/mean": 0.1484375, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.7100748419761658, "rewards/symbolic_reward_partial_score/std": 0.2193126529455185, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0577545166015625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 16.59449577331543, "step": 2925 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.34911976754665375, "epoch": 4.689102564102564, "grad_norm": 0.014677703380584717, "learning_rate": 1e-06, "loss": 0.0362, "step": 2926 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.34325067698955536, "epoch": 4.690705128205128, "grad_norm": 0.016701413318514824, "learning_rate": 1e-06, "loss": 0.0602, "step": 2927 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.34585435688495636, "epoch": 4.6923076923076925, "grad_norm": 0.026525946334004402, "learning_rate": 1e-06, "loss": 0.0583, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3789.0, "completions/mean_length": 2502.28515625, "completions/mean_terminated_length": 2054.48779296875, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "entropy": 0.3411019742488861, "epoch": 4.693910256410256, "frac_reward_zero_std": 0.3125, "grad_norm": 701.24560546875, "learning_rate": 1e-06, "loss": 0.1057, "num_tokens": 1757960018.0, "reward": 0.4438255727291107, "reward_std": 0.05670292302966118, "rewards/progression_diversity/mean": -0.0007465816452167928, "rewards/progression_diversity/std": 0.006228178273886442, "rewards/symbolic_reward_accuracy/mean": 0.361328125, "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, "rewards/symbolic_reward_partial_score/mean": 0.7639485597610474, "rewards/symbolic_reward_partial_score/std": 0.24817843735218048, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0602136850357056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 15.609932899475098, "step": 2929 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3556542992591858, "epoch": 4.69551282051282, "grad_norm": 0.02133711613714695, "learning_rate": 1e-06, "loss": 0.0417, "step": 2930 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3535788506269455, "epoch": 4.697115384615385, "grad_norm": 0.012668832205235958, "learning_rate": 1e-06, "loss": 0.0334, "step": 2931 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.34661364555358887, "epoch": 4.698717948717949, "grad_norm": 0.013216467574238777, "learning_rate": 1e-06, "loss": 0.0117, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3756.0, "completions/mean_length": 2681.73046875, "completions/mean_terminated_length": 2037.2474365234375, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "entropy": 0.3384409695863724, "epoch": 4.700320512820513, "frac_reward_zero_std": 0.25, "grad_norm": 556.1314697265625, "learning_rate": 1e-06, "loss": 0.0886, "num_tokens": 1760146200.0, "reward": 0.3850386440753937, "reward_std": 0.08345068991184235, "rewards/progression_diversity/mean": -0.0010191474575549364, "rewards/progression_diversity/std": 0.0071332212537527084, "rewards/symbolic_reward_accuracy/mean": 0.279296875, "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, "rewards/symbolic_reward_partial_score/mean": 0.738574206829071, "rewards/symbolic_reward_partial_score/std": 0.2675086259841919, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0498528480529785, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 20.236854553222656, "step": 2933 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.33859123289585114, "epoch": 4.701923076923077, "grad_norm": 0.012142288498580456, "learning_rate": 1e-06, "loss": 0.0558, "step": 2934 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.3401936739683151, "epoch": 4.703525641025641, "grad_norm": 0.01944906637072563, "learning_rate": 1e-06, "loss": 0.0776, "step": 2935 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.34829023480415344, "epoch": 4.705128205128205, "grad_norm": 0.015569426119327545, "learning_rate": 1e-06, "loss": 0.0491, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3820.0, "completions/mean_length": 2710.529296875, "completions/mean_terminated_length": 2038.0633544921875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "entropy": 0.3164035826921463, "epoch": 4.706730769230769, "frac_reward_zero_std": 0.3125, "grad_norm": 1386.5074462890625, "learning_rate": 1e-06, "loss": 0.1388, "num_tokens": 1762370215.0, "reward": 0.3807540237903595, "reward_std": 0.08757130056619644, "rewards/progression_diversity/mean": -0.00028213387122377753, "rewards/progression_diversity/std": 0.002852478064596653, "rewards/symbolic_reward_accuracy/mean": 0.26953125, "rewards/symbolic_reward_accuracy/std": 0.44415023922920227, "rewards/symbolic_reward_partial_score/mean": 0.7457519769668579, "rewards/symbolic_reward_partial_score/std": 0.2586316764354706, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.044259786605835, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 21.89950942993164, "step": 2937 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.35227370262145996, "epoch": 4.708333333333333, "grad_norm": 0.01955762878060341, "learning_rate": 1e-06, "loss": 0.0149, "step": 2938 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3245883733034134, "epoch": 4.709935897435898, "grad_norm": 0.016230937093496323, "learning_rate": 1e-06, "loss": 0.1317, "step": 2939 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.349644273519516, "epoch": 4.711538461538462, "grad_norm": 0.029934851452708244, "learning_rate": 1e-06, "loss": 0.0294, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4126.0, "completions/mean_length": 2700.505859375, "completions/mean_terminated_length": 2056.90576171875, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "entropy": 0.3398941606283188, "epoch": 4.7131410256410255, "frac_reward_zero_std": 0.1875, "grad_norm": 1812.2177734375, "learning_rate": 1e-06, "loss": 0.0752, "num_tokens": 1764614458.0, "reward": 0.33322569727897644, "reward_std": 0.07418441027402878, "rewards/progression_diversity/mean": -0.00018588484090287238, "rewards/progression_diversity/std": 0.003068706952035427, "rewards/symbolic_reward_accuracy/mean": 0.2109375, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.7025552988052368, "rewards/symbolic_reward_partial_score/std": 0.25797954201698303, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0462868213653564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 20.06338882446289, "step": 2941 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.34091413021087646, "epoch": 4.714743589743589, "grad_norm": 0.012703510001301765, "learning_rate": 1e-06, "loss": 0.0848, "step": 2942 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3459191471338272, "epoch": 4.716346153846154, "grad_norm": 0.11944550275802612, "learning_rate": 1e-06, "loss": 0.0311, "step": 2943 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.31565621495246887, "epoch": 4.717948717948718, "grad_norm": 0.027011489495635033, "learning_rate": 1e-06, "loss": 0.1477, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 5223.0, "completions/mean_length": 2783.65234375, "completions/mean_terminated_length": 2026.5196533203125, "completions/min_length": 704.0, "completions/min_terminated_length": 704.0, "entropy": 0.3283272385597229, "epoch": 4.719551282051282, "frac_reward_zero_std": 0.3125, "grad_norm": 1712.640625, "learning_rate": 1e-06, "loss": 0.1465, "num_tokens": 1766886824.0, "reward": 0.4009649455547333, "reward_std": 0.07681870460510254, "rewards/progression_diversity/mean": -0.0001883438671939075, "rewards/progression_diversity/std": 0.0030330359004437923, "rewards/symbolic_reward_accuracy/mean": 0.30078125, "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, "rewards/symbolic_reward_partial_score/mean": 0.751269519329071, "rewards/symbolic_reward_partial_score/std": 0.2660304307937622, "rewards/tag_count_reward/mean": -0.048828125, "rewards/tag_count_reward/std": 0.2157193273305893, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0471668243408203, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 20.307308197021484, "step": 2945 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.34363169968128204, "epoch": 4.721153846153846, "grad_norm": 0.8427232503890991, "learning_rate": 1e-06, "loss": 0.0328, "step": 2946 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3473658561706543, "epoch": 4.722756410256411, "grad_norm": 0.014580237679183483, "learning_rate": 1e-06, "loss": 0.0303, "step": 2947 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.32401567697525024, "epoch": 4.7243589743589745, "grad_norm": 0.0191756933927536, "learning_rate": 1e-06, "loss": 0.1417, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3855.0, "completions/mean_length": 3248.55078125, "completions/mean_terminated_length": 2044.234619140625, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "entropy": 0.31611892580986023, "epoch": 4.725961538461538, "frac_reward_zero_std": 0.125, "grad_norm": 2376.0654296875, "learning_rate": 1e-06, "loss": 0.1231, "num_tokens": 1769473858.0, "reward": 0.37166327238082886, "reward_std": 0.13059641420841217, "rewards/progression_diversity/mean": -0.00017961920821107924, "rewards/progression_diversity/std": 0.00354160089045763, "rewards/symbolic_reward_accuracy/mean": 0.26953125, "rewards/symbolic_reward_accuracy/std": 0.44415023922920227, "rewards/symbolic_reward_partial_score/mean": 0.724560558795929, "rewards/symbolic_reward_partial_score/std": 0.29390403628349304, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.021704912185669, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 30.869539260864258, "step": 2949 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3012668788433075, "epoch": 4.727564102564102, "grad_norm": 0.017393101006746292, "learning_rate": 1e-06, "loss": 0.1819, "step": 2950 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3096124082803726, "epoch": 4.729166666666667, "grad_norm": 0.22197610139846802, "learning_rate": 1e-06, "loss": 0.1091, "step": 2951 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3276834934949875, "epoch": 4.730769230769231, "grad_norm": 0.0192260779440403, "learning_rate": 1e-06, "loss": 0.0842, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4381.0, "completions/mean_length": 2684.7734375, "completions/mean_terminated_length": 2040.4334716796875, "completions/min_length": 447.0, "completions/min_terminated_length": 447.0, "entropy": 0.3336394876241684, "epoch": 4.732371794871795, "frac_reward_zero_std": 0.3125, "grad_norm": 2094.456298828125, "learning_rate": 1e-06, "loss": 0.0981, "num_tokens": 1771661390.0, "reward": 0.2928215265274048, "reward_std": 0.06420518457889557, "rewards/progression_diversity/mean": -7.55006549297832e-05, "rewards/progression_diversity/std": 0.001374648418277502, "rewards/symbolic_reward_accuracy/mean": 0.1484375, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.6941732168197632, "rewards/symbolic_reward_partial_score/std": 0.24089013040065765, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0465376377105713, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 20.815563201904297, "step": 2953 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3418269008398056, "epoch": 4.733974358974359, "grad_norm": 0.02304861880838871, "learning_rate": 1e-06, "loss": 0.0803, "step": 2954 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3348529040813446, "epoch": 4.735576923076923, "grad_norm": 0.02030983939766884, "learning_rate": 1e-06, "loss": 0.0815, "step": 2955 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3480427414178848, "epoch": 4.737179487179487, "grad_norm": 0.012364787049591541, "learning_rate": 1e-06, "loss": 0.0586, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.06640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3864.0, "completions/mean_length": 3018.962890625, "completions/mean_terminated_length": 2068.311767578125, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "entropy": 0.30448389053344727, "epoch": 4.738782051282051, "frac_reward_zero_std": 0.0625, "grad_norm": 8229.33984375, "learning_rate": 1e-06, "loss": 0.1366, "num_tokens": 1774171099.0, "reward": 0.29637160897254944, "reward_std": 0.10101476311683655, "rewards/progression_diversity/mean": -4.6070697862887755e-05, "rewards/progression_diversity/std": 0.0010424609063193202, "rewards/symbolic_reward_accuracy/mean": 0.154296875, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.6994954347610474, "rewards/symbolic_reward_partial_score/std": 0.25719350576400757, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0173419713974, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 31.943614959716797, "step": 2957 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3160858005285263, "epoch": 4.740384615384615, "grad_norm": 0.03362959623336792, "learning_rate": 1e-06, "loss": 0.1305, "step": 2958 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3264904320240021, "epoch": 4.74198717948718, "grad_norm": 0.018001394346356392, "learning_rate": 1e-06, "loss": 0.06, "step": 2959 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.30660150945186615, "epoch": 4.743589743589744, "grad_norm": 0.01249605417251587, "learning_rate": 1e-06, "loss": 0.144, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3706.0, "completions/mean_length": 2441.875, "completions/mean_terminated_length": 1992.1290283203125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "entropy": 0.35260292887687683, "epoch": 4.7451923076923075, "frac_reward_zero_std": 0.375, "grad_norm": 488.8838806152344, "learning_rate": 1e-06, "loss": 0.021, "num_tokens": 1776175595.0, "reward": 0.36955156922340393, "reward_std": 0.05210120975971222, "rewards/progression_diversity/mean": -0.0004122033715248108, "rewards/progression_diversity/std": 0.007343790493905544, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.7539876103401184, "rewards/symbolic_reward_partial_score/std": 0.2148759663105011, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0614118576049805, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 13.594598770141602, "step": 2961 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.347807839512825, "epoch": 4.746794871794872, "grad_norm": 0.016412392258644104, "learning_rate": 1e-06, "loss": 0.0841, "step": 2962 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.33857110142707825, "epoch": 4.748397435897436, "grad_norm": 0.016963588073849678, "learning_rate": 1e-06, "loss": 0.0678, "step": 2963 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.34765180945396423, "epoch": 4.75, "grad_norm": 0.03191974386572838, "learning_rate": 1e-06, "loss": 0.0283, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4131.0, "completions/mean_length": 3280.611328125, "completions/mean_terminated_length": 2079.234619140625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "entropy": 0.29712438583374023, "epoch": 4.751602564102564, "frac_reward_zero_std": 0.1875, "grad_norm": 3667.5302734375, "learning_rate": 1e-06, "loss": 0.1519, "num_tokens": 1778749860.0, "reward": 0.3474825918674469, "reward_std": 0.08697932213544846, "rewards/progression_diversity/mean": -0.00027683598455041647, "rewards/progression_diversity/std": 0.002760024508461356, "rewards/symbolic_reward_accuracy/mean": 0.244140625, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.6973470449447632, "rewards/symbolic_reward_partial_score/std": 0.2716548442840576, "rewards/tag_count_reward/mean": -0.08203125, "rewards/tag_count_reward/std": 0.2746807038784027, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0109891891479492, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 36.200584411621094, "step": 2965 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3089023381471634, "epoch": 4.753205128205128, "grad_norm": 0.01566610299050808, "learning_rate": 1e-06, "loss": 0.2778, "step": 2966 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.2989957928657532, "epoch": 4.7548076923076925, "grad_norm": 0.03153858333826065, "learning_rate": 1e-06, "loss": 0.1536, "step": 2967 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3228546977043152, "epoch": 4.756410256410256, "grad_norm": 0.017119932919740677, "learning_rate": 1e-06, "loss": 0.0581, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3993.0, "completions/mean_length": 3280.58203125, "completions/mean_terminated_length": 2048.636962890625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "entropy": 0.303473562002182, "epoch": 4.75801282051282, "frac_reward_zero_std": 0.125, "grad_norm": 2500.171142578125, "learning_rate": 1e-06, "loss": 0.1769, "num_tokens": 1781321806.0, "reward": 0.26970726251602173, "reward_std": 0.08941252529621124, "rewards/progression_diversity/mean": -0.0009572784183546901, "rewards/progression_diversity/std": 0.009158154018223286, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.661425769329071, "rewards/symbolic_reward_partial_score/std": 0.2673282027244568, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0137133598327637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 36.107513427734375, "step": 2969 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3026905506849289, "epoch": 4.759615384615385, "grad_norm": 0.021056868135929108, "learning_rate": 1e-06, "loss": 0.14, "step": 2970 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.3188410997390747, "epoch": 4.761217948717949, "grad_norm": 0.01921817846596241, "learning_rate": 1e-06, "loss": 0.095, "step": 2971 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.32021498680114746, "epoch": 4.762820512820513, "grad_norm": 0.04374043643474579, "learning_rate": 1e-06, "loss": 0.0584, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3804.0, "completions/mean_length": 3198.404296875, "completions/mean_terminated_length": 2050.61376953125, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "entropy": 0.3287845551967621, "epoch": 4.764423076923077, "frac_reward_zero_std": 0.28125, "grad_norm": 576.1661987304688, "learning_rate": 1e-06, "loss": 0.0627, "num_tokens": 1783830109.0, "reward": 0.3199334740638733, "reward_std": 0.08054739236831665, "rewards/progression_diversity/mean": -0.0003103530907537788, "rewards/progression_diversity/std": 0.003829639870673418, "rewards/symbolic_reward_accuracy/mean": 0.193359375, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.7044758796691895, "rewards/symbolic_reward_partial_score/std": 0.27706295251846313, "rewards/tag_count_reward/mean": -0.07421875, "rewards/tag_count_reward/std": 0.2623828947544098, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.010635495185852, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 784.0, "sampling/sampling_logp_difference/mean": 36.60023498535156, "step": 2973 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3081291615962982, "epoch": 4.766025641025641, "grad_norm": 0.018835294991731644, "learning_rate": 1e-06, "loss": 0.1503, "step": 2974 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.29927410185337067, "epoch": 4.767628205128205, "grad_norm": 0.013380290009081364, "learning_rate": 1e-06, "loss": 0.1841, "step": 2975 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.322013258934021, "epoch": 4.769230769230769, "grad_norm": 0.016294099390506744, "learning_rate": 1e-06, "loss": 0.09, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 4037.0, "completions/mean_length": 3144.052734375, "completions/mean_terminated_length": 2142.71240234375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 0.3358401507139206, "epoch": 4.770833333333333, "frac_reward_zero_std": 0.3125, "grad_norm": 1361.25830078125, "learning_rate": 1e-06, "loss": 0.0311, "num_tokens": 1786338952.0, "reward": 0.34467262029647827, "reward_std": 0.06627324223518372, "rewards/progression_diversity/mean": -2.456928814353887e-05, "rewards/progression_diversity/std": 0.000555939506739378, "rewards/symbolic_reward_accuracy/mean": 0.24609375, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.6788574457168579, "rewards/symbolic_reward_partial_score/std": 0.2802160680294037, "rewards/tag_count_reward/mean": -0.06640625, "rewards/tag_count_reward/std": 0.2492343932390213, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0176117420196533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 780.0, "sampling/sampling_logp_difference/mean": 32.03617858886719, "step": 2977 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.31088291108608246, "epoch": 4.772435897435898, "grad_norm": 0.012580040842294693, "learning_rate": 1e-06, "loss": 0.1263, "step": 2978 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31900927424430847, "epoch": 4.774038461538462, "grad_norm": 0.012707662768661976, "learning_rate": 1e-06, "loss": 0.0779, "step": 2979 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.29433473944664, "epoch": 4.7756410256410255, "grad_norm": 0.01724996045231819, "learning_rate": 1e-06, "loss": 0.2277, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3774.0, "completions/mean_length": 3808.806640625, "completions/mean_terminated_length": 2170.97802734375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.28534041345119476, "epoch": 4.777243589743589, "frac_reward_zero_std": 0.15625, "grad_norm": 1845.77587890625, "learning_rate": 1e-06, "loss": 0.1926, "num_tokens": 1789274645.0, "reward": 0.25106510519981384, "reward_std": 0.11057647317647934, "rewards/progression_diversity/mean": -0.0009115600259974599, "rewards/progression_diversity/std": 0.0063006458804011345, "rewards/symbolic_reward_accuracy/mean": 0.130859375, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.6110025644302368, "rewards/symbolic_reward_partial_score/std": 0.29573264718055725, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9930451512336731, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 40.782203674316406, "step": 2981 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3515625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.303585946559906, "epoch": 4.778846153846154, "grad_norm": 0.012852794490754604, "learning_rate": 1e-06, "loss": 0.1133, "step": 2982 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2768773287534714, "epoch": 4.780448717948718, "grad_norm": 0.10220827162265778, "learning_rate": 1e-06, "loss": 0.2116, "step": 2983 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30345627665519714, "epoch": 4.782051282051282, "grad_norm": 0.06796098500490189, "learning_rate": 1e-06, "loss": 0.104, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3753.0, "completions/mean_length": 3173.6015625, "completions/mean_terminated_length": 2174.495849609375, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "entropy": 0.30603721737861633, "epoch": 4.783653846153846, "frac_reward_zero_std": 0.1875, "grad_norm": 5978.078125, "learning_rate": 1e-06, "loss": 0.1597, "num_tokens": 1791752489.0, "reward": 0.3074996769428253, "reward_std": 0.0968872681260109, "rewards/progression_diversity/mean": -0.0014983013970777392, "rewards/progression_diversity/std": 0.009809834882616997, "rewards/symbolic_reward_accuracy/mean": 0.19140625, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.6656738519668579, "rewards/symbolic_reward_partial_score/std": 0.2579022943973541, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0267293453216553, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 26.516414642333984, "step": 2985 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.32077351212501526, "epoch": 4.785256410256411, "grad_norm": 0.03152855858206749, "learning_rate": 1e-06, "loss": 0.1114, "step": 2986 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3265487253665924, "epoch": 4.7868589743589745, "grad_norm": 5331029908783104.0, "learning_rate": 1e-06, "loss": 734216716288.0, "step": 2987 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.31940317153930664, "epoch": 4.788461538461538, "grad_norm": 18845.392578125, "learning_rate": 1e-06, "loss": 5.7057, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4206.0, "completions/mean_length": 3355.01953125, "completions/mean_terminated_length": 2160.46484375, "completions/min_length": 952.0, "completions/min_terminated_length": 952.0, "entropy": 0.3106093406677246, "epoch": 4.790064102564102, "frac_reward_zero_std": 0.25, "grad_norm": 1507.441650390625, "learning_rate": 1e-06, "loss": 0.097, "num_tokens": 1794362163.0, "reward": 0.36152663826942444, "reward_std": 0.09895937889814377, "rewards/progression_diversity/mean": -0.0021223740186542273, "rewards/progression_diversity/std": 0.013677747920155525, "rewards/symbolic_reward_accuracy/mean": 0.2578125, "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, "rewards/symbolic_reward_partial_score/mean": 0.7129719853401184, "rewards/symbolic_reward_partial_score/std": 0.2863211929798126, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0154645442962646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 28.80075454711914, "step": 2989 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.28619883954524994, "epoch": 4.791666666666667, "grad_norm": 0.02123548462986946, "learning_rate": 1e-06, "loss": 0.1906, "step": 2990 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.29719385504722595, "epoch": 4.793269230769231, "grad_norm": 0.01841905526816845, "learning_rate": 1e-06, "loss": 0.1245, "step": 2991 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3073057234287262, "epoch": 4.794871794871795, "grad_norm": 0.01792910508811474, "learning_rate": 1e-06, "loss": 0.0712, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.13671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 4189.0, "completions/mean_length": 3995.111328125, "completions/mean_terminated_length": 2033.0701904296875, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.2574385032057762, "epoch": 4.796474358974359, "frac_reward_zero_std": 0.1875, "grad_norm": 2505.857666015625, "learning_rate": 1e-06, "loss": 0.197, "num_tokens": 1797306380.0, "reward": 0.26426079869270325, "reward_std": 0.08823113143444061, "rewards/progression_diversity/mean": -0.000679816585034132, "rewards/progression_diversity/std": 0.004317981190979481, "rewards/symbolic_reward_accuracy/mean": 0.15234375, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.6165690422058105, "rewards/symbolic_reward_partial_score/std": 0.3016470968723297, "rewards/tag_count_reward/mean": -0.12109375, "rewards/tag_count_reward/std": 0.3265552520751953, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9852645397186279, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 33.74715042114258, "step": 2993 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2707083225250244, "epoch": 4.798076923076923, "grad_norm": 0.09528608620166779, "learning_rate": 1e-06, "loss": 0.13, "step": 2994 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2617756277322769, "epoch": 4.799679487179487, "grad_norm": 0.01799207553267479, "learning_rate": 1e-06, "loss": 0.1663, "step": 2995 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.28101493418216705, "epoch": 4.801282051282051, "grad_norm": 0.01640203595161438, "learning_rate": 1e-06, "loss": 0.153, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3428.0, "completions/mean_length": 3477.884765625, "completions/mean_terminated_length": 2018.9324951171875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.3022962361574173, "epoch": 4.802884615384615, "frac_reward_zero_std": 0.25, "grad_norm": 2163.138427734375, "learning_rate": 1e-06, "loss": 0.096, "num_tokens": 1799923057.0, "reward": 0.2933644652366638, "reward_std": 0.09190071374177933, "rewards/progression_diversity/mean": -0.00046986271627247334, "rewards/progression_diversity/std": 0.004312288947403431, "rewards/symbolic_reward_accuracy/mean": 0.162109375, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.6836262941360474, "rewards/symbolic_reward_partial_score/std": 0.27113112807273865, "rewards/tag_count_reward/mean": -0.08984375, "rewards/tag_count_reward/std": 0.2862374484539032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0017528533935547, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 760.0, "sampling/sampling_logp_difference/mean": 26.413158416748047, "step": 2997 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2857065796852112, "epoch": 4.80448717948718, "grad_norm": 0.01477570179849863, "learning_rate": 1e-06, "loss": 0.2033, "step": 2998 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2832331210374832, "epoch": 4.806089743589744, "grad_norm": 0.016135964542627335, "learning_rate": 1e-06, "loss": 0.9264, "step": 2999 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30287647247314453, "epoch": 4.8076923076923075, "grad_norm": 3408249.25, "learning_rate": 1e-06, "loss": 1910.0106, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.099609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4039.0, "completions/mean_length": 3437.560546875, "completions/mean_terminated_length": 2005.30810546875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "entropy": 0.31259530782699585, "epoch": 4.809294871794872, "frac_reward_zero_std": 0.21875, "grad_norm": 722.9392700195312, "learning_rate": 1e-06, "loss": 0.0731, "num_tokens": 1802489712.0, "reward": 0.3412483334541321, "reward_std": 0.09203283488750458, "rewards/progression_diversity/mean": -0.00016795920964796096, "rewards/progression_diversity/std": 0.0027168295346200466, "rewards/symbolic_reward_accuracy/mean": 0.234375, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.7013020515441895, "rewards/symbolic_reward_partial_score/std": 0.2958465814590454, "rewards/tag_count_reward/mean": -0.09765625, "rewards/tag_count_reward/std": 0.29713961482048035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0116255283355713, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 22.6229248046875, "step": 3001 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3023006319999695, "epoch": 4.810897435897436, "grad_norm": 0.01664135977625847, "learning_rate": 1e-06, "loss": 0.1274, "step": 3002 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.30224159359931946, "epoch": 4.8125, "grad_norm": 0.016737397760152817, "learning_rate": 1e-06, "loss": 0.0899, "step": 3003 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2703643888235092, "epoch": 4.814102564102564, "grad_norm": 0.019325420260429382, "learning_rate": 1e-06, "loss": 0.2522, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3899.0, "completions/mean_length": 3158.623046875, "completions/mean_terminated_length": 2007.3695068359375, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.29125888645648956, "epoch": 4.815705128205128, "frac_reward_zero_std": 0.25, "grad_norm": 1334.0791015625, "learning_rate": 1e-06, "loss": 0.1156, "num_tokens": 1804920927.0, "reward": 0.4101855754852295, "reward_std": 0.08342595398426056, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.328125, "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, "rewards/symbolic_reward_partial_score/mean": 0.7468424439430237, "rewards/symbolic_reward_partial_score/std": 0.2777378261089325, "rewards/tag_count_reward/mean": -0.107421875, "rewards/tag_count_reward/std": 0.30995169281959534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0229686498641968, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 15.124759674072266, "step": 3005 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2940514087677002, "epoch": 4.8173076923076925, "grad_norm": 0.030098246410489082, "learning_rate": 1e-06, "loss": 0.1046, "step": 3006 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.29679183661937714, "epoch": 4.818910256410256, "grad_norm": 0.13558229804039001, "learning_rate": 1e-06, "loss": 0.0689, "step": 3007 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.28803931176662445, "epoch": 4.82051282051282, "grad_norm": 0.028529301285743713, "learning_rate": 1e-06, "loss": 0.0629, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 3604.537109375, "completions/mean_terminated_length": 2066.52734375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "entropy": 0.2692013829946518, "epoch": 4.822115384615385, "frac_reward_zero_std": 0.125, "grad_norm": 2515.8955078125, "learning_rate": 1e-06, "loss": 0.1519, "num_tokens": 1807674082.0, "reward": 0.2645507752895355, "reward_std": 0.10678978264331818, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.138671875, "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, "rewards/symbolic_reward_partial_score/mean": 0.6370442509651184, "rewards/symbolic_reward_partial_score/std": 0.28384286165237427, "rewards/tag_count_reward/mean": -0.09765625, "rewards/tag_count_reward/std": 0.29713961482048035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.983115553855896, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 29.70915985107422, "step": 3009 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.26541532576084137, "epoch": 4.823717948717949, "grad_norm": 0.19816499948501587, "learning_rate": 1e-06, "loss": 0.1234, "step": 3010 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.2563439905643463, "epoch": 4.825320512820513, "grad_norm": 0.014676299877464771, "learning_rate": 1e-06, "loss": 0.152, "step": 3011 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.27064676582813263, "epoch": 4.826923076923077, "grad_norm": 0.022342152893543243, "learning_rate": 1e-06, "loss": 0.1299, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 4520.0, "completions/mean_length": 3052.28125, "completions/mean_terminated_length": 2074.062744140625, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "entropy": 0.2788449823856354, "epoch": 4.828525641025641, "frac_reward_zero_std": 0.15625, "grad_norm": 1405.5775146484375, "learning_rate": 1e-06, "loss": 0.07, "num_tokens": 1810095938.0, "reward": 0.3012896776199341, "reward_std": 0.07198358327150345, "rewards/progression_diversity/mean": -0.0009162867208942771, "rewards/progression_diversity/std": 0.006650157272815704, "rewards/symbolic_reward_accuracy/mean": 0.177734375, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.6703450679779053, "rewards/symbolic_reward_partial_score/std": 0.2553132474422455, "rewards/tag_count_reward/mean": -0.064453125, "rewards/tag_count_reward/std": 0.24579854309558868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0105712413787842, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 764.0, "sampling/sampling_logp_difference/mean": 21.2810001373291, "step": 3013 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.258184090256691, "epoch": 4.830128205128205, "grad_norm": 0.03618193790316582, "learning_rate": 1e-06, "loss": 0.184, "step": 3014 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2832891196012497, "epoch": 4.831730769230769, "grad_norm": 0.017958376556634903, "learning_rate": 1e-06, "loss": 0.0317, "step": 3015 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2775871604681015, "epoch": 4.833333333333333, "grad_norm": 0.02259541116654873, "learning_rate": 1e-06, "loss": 0.0932, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.107421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3844.0, "completions/mean_length": 3652.623046875, "completions/mean_terminated_length": 2120.400390625, "completions/min_length": 610.0, "completions/min_terminated_length": 610.0, "entropy": 0.25543099641799927, "epoch": 4.834935897435898, "frac_reward_zero_std": 0.25, "grad_norm": 3206.30419921875, "learning_rate": 1e-06, "loss": 0.1315, "num_tokens": 1812886417.0, "reward": 0.26881104707717896, "reward_std": 0.09635767340660095, "rewards/progression_diversity/mean": -0.0021946923807263374, "rewards/progression_diversity/std": 0.014525186270475388, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.6617349982261658, "rewards/symbolic_reward_partial_score/std": 0.27741891145706177, "rewards/tag_count_reward/mean": -0.09375, "rewards/tag_count_reward/std": 0.29176566004753113, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9844847321510315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 35.239051818847656, "step": 3017 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2537519484758377, "epoch": 4.836538461538462, "grad_norm": 0.014091641642153263, "learning_rate": 1e-06, "loss": 0.1088, "step": 3018 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2543463036417961, "epoch": 4.8381410256410255, "grad_norm": 0.02137858420610428, "learning_rate": 1e-06, "loss": 0.148, "step": 3019 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.25083983689546585, "epoch": 4.839743589743589, "grad_norm": 0.021724727004766464, "learning_rate": 1e-06, "loss": 0.1358, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.111328125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3564.0, "completions/mean_length": 3640.005859375, "completions/mean_terminated_length": 2043.505615234375, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "entropy": 0.25516054779291153, "epoch": 4.841346153846154, "frac_reward_zero_std": 0.28125, "grad_norm": 753.2295532226562, "learning_rate": 1e-06, "loss": 0.0884, "num_tokens": 1815684788.0, "reward": 0.29512912034988403, "reward_std": 0.08512111753225327, "rewards/progression_diversity/mean": -0.0027145203202962875, "rewards/progression_diversity/std": 0.0170114878565073, "rewards/symbolic_reward_accuracy/mean": 0.171875, "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, "rewards/symbolic_reward_partial_score/mean": 0.6752604246139526, "rewards/symbolic_reward_partial_score/std": 0.29889386892318726, "rewards/tag_count_reward/mean": -0.10546875, "rewards/tag_count_reward/std": 0.3074568510055542, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.986849308013916, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 32.57376480102539, "step": 3021 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.23778339475393295, "epoch": 4.842948717948718, "grad_norm": 0.759289562702179, "learning_rate": 1e-06, "loss": 0.2053, "step": 3022 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.25857195258140564, "epoch": 4.844551282051282, "grad_norm": 0.02268078364431858, "learning_rate": 1e-06, "loss": 0.0626, "step": 3023 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.24314356595277786, "epoch": 4.846153846153846, "grad_norm": 0.016103658825159073, "learning_rate": 1e-06, "loss": 0.1707, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.087890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 4237.0, "completions/mean_length": 3289.69921875, "completions/mean_terminated_length": 2027.935791015625, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "entropy": 0.27912090718746185, "epoch": 4.847756410256411, "frac_reward_zero_std": 0.40625, "grad_norm": 1176.0621337890625, "learning_rate": 1e-06, "loss": 0.0734, "num_tokens": 1818186474.0, "reward": 0.2887709140777588, "reward_std": 0.047475919127464294, "rewards/progression_diversity/mean": -0.0008389006252400577, "rewards/progression_diversity/std": 0.008314850740134716, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.6767903566360474, "rewards/symbolic_reward_partial_score/std": 0.2791590392589569, "rewards/tag_count_reward/mean": -0.080078125, "rewards/tag_count_reward/std": 0.271679550409317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0138845443725586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 768.0, "sampling/sampling_logp_difference/mean": 23.615236282348633, "step": 3025 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.28166961669921875, "epoch": 4.8493589743589745, "grad_norm": 0.01939082331955433, "learning_rate": 1e-06, "loss": 0.0351, "step": 3026 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.25974585115909576, "epoch": 4.850961538461538, "grad_norm": 0.029129792004823685, "learning_rate": 1e-06, "loss": 0.1673, "step": 3027 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.27130499482154846, "epoch": 4.852564102564102, "grad_norm": 0.09306855499744415, "learning_rate": 1e-06, "loss": 0.0714, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 3610.380859375, "completions/mean_terminated_length": 2041.6907958984375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.2426222562789917, "epoch": 4.854166666666667, "frac_reward_zero_std": 0.25, "grad_norm": 1595.66259765625, "learning_rate": 1e-06, "loss": 0.126, "num_tokens": 1820930301.0, "reward": 0.30804911255836487, "reward_std": 0.07695259153842926, "rewards/progression_diversity/mean": -0.0027075535617768764, "rewards/progression_diversity/std": 0.02146134339272976, "rewards/symbolic_reward_accuracy/mean": 0.20703125, "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, "rewards/symbolic_reward_partial_score/mean": 0.6467121839523315, "rewards/symbolic_reward_partial_score/std": 0.3115384578704834, "rewards/tag_count_reward/mean": -0.1015625, "rewards/tag_count_reward/std": 0.30236753821372986, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9908959269523621, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 32.86491012573242, "step": 3029 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24747144430875778, "epoch": 4.855769230769231, "grad_norm": 0.017448777332901955, "learning_rate": 1e-06, "loss": 0.1274, "step": 3030 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.26713868975639343, "epoch": 4.857371794871795, "grad_norm": 0.018266018480062485, "learning_rate": 1e-06, "loss": 0.0513, "step": 3031 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23238349705934525, "epoch": 4.858974358974359, "grad_norm": 0.013434065505862236, "learning_rate": 1e-06, "loss": 0.2029, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3829.0, "completions/mean_length": 3513.24609375, "completions/mean_terminated_length": 1995.733642578125, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "entropy": 0.2464209794998169, "epoch": 4.860576923076923, "frac_reward_zero_std": 0.28125, "grad_norm": 1448.568115234375, "learning_rate": 1e-06, "loss": 0.1083, "num_tokens": 1823646795.0, "reward": 0.3104338049888611, "reward_std": 0.06594209372997284, "rewards/progression_diversity/mean": -0.003005102276802063, "rewards/progression_diversity/std": 0.022076178342103958, "rewards/symbolic_reward_accuracy/mean": 0.18359375, "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, "rewards/symbolic_reward_partial_score/mean": 0.698291003704071, "rewards/symbolic_reward_partial_score/std": 0.2843267321586609, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9879831075668335, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 34.731117248535156, "step": 3033 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.23648971319198608, "epoch": 4.862179487179487, "grad_norm": 0.027491990476846695, "learning_rate": 1e-06, "loss": 0.1345, "step": 3034 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23159796744585037, "epoch": 4.863782051282051, "grad_norm": 0.27813470363616943, "learning_rate": 1e-06, "loss": 0.1389, "step": 3035 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.23856277763843536, "epoch": 4.865384615384615, "grad_norm": 0.04474368691444397, "learning_rate": 1e-06, "loss": 0.0929, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3863.0, "completions/mean_length": 3395.388671875, "completions/mean_terminated_length": 1927.11083984375, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "entropy": 0.24479106068611145, "epoch": 4.86698717948718, "frac_reward_zero_std": 0.15625, "grad_norm": 1553.0164794921875, "learning_rate": 1e-06, "loss": 0.1017, "num_tokens": 1826229362.0, "reward": 0.31477978825569153, "reward_std": 0.07595334947109222, "rewards/progression_diversity/mean": -0.0034673186019062996, "rewards/progression_diversity/std": 0.023447884246706963, "rewards/symbolic_reward_accuracy/mean": 0.208984375, "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, "rewards/symbolic_reward_partial_score/mean": 0.6639648079872131, "rewards/symbolic_reward_partial_score/std": 0.2900123596191406, "rewards/tag_count_reward/mean": -0.09765625, "rewards/tag_count_reward/std": 0.29713961482048035, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9932220578193665, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 33.091861724853516, "step": 3037 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.24616575241088867, "epoch": 4.868589743589744, "grad_norm": 0.10199563205242157, "learning_rate": 1e-06, "loss": 0.0912, "step": 3038 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.23579637706279755, "epoch": 4.8701923076923075, "grad_norm": 0.0868377834558487, "learning_rate": 1e-06, "loss": 0.1658, "step": 3039 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.23798397183418274, "epoch": 4.871794871794872, "grad_norm": 0.019305624067783356, "learning_rate": 1e-06, "loss": 0.1112, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3809.0, "completions/mean_length": 3376.041015625, "completions/mean_terminated_length": 1968.2532958984375, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "entropy": 0.2433418408036232, "epoch": 4.873397435897436, "frac_reward_zero_std": 0.21875, "grad_norm": 1277.455810546875, "learning_rate": 1e-06, "loss": 0.0845, "num_tokens": 1828798391.0, "reward": 0.32777589559555054, "reward_std": 0.09310010075569153, "rewards/progression_diversity/mean": -0.002196947578340769, "rewards/progression_diversity/std": 0.019314348697662354, "rewards/symbolic_reward_accuracy/mean": 0.224609375, "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, "rewards/symbolic_reward_partial_score/mean": 0.675341784954071, "rewards/symbolic_reward_partial_score/std": 0.3014916479587555, "rewards/tag_count_reward/mean": -0.095703125, "rewards/tag_count_reward/std": 0.2944713830947876, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9986719489097595, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 29.44231605529785, "step": 3041 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.25057369470596313, "epoch": 4.875, "grad_norm": 0.013811684213578701, "learning_rate": 1e-06, "loss": 0.0995, "step": 3042 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.24305298924446106, "epoch": 4.876602564102564, "grad_norm": 0.014420563355088234, "learning_rate": 1e-06, "loss": 0.1266, "step": 3043 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.24530383944511414, "epoch": 4.878205128205128, "grad_norm": 0.03273453935980797, "learning_rate": 1e-06, "loss": 0.0984, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 2730.166015625, "completions/mean_terminated_length": 1940.274658203125, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "entropy": 0.24772102385759354, "epoch": 4.8798076923076925, "frac_reward_zero_std": 0.3125, "grad_norm": 1146.00244140625, "learning_rate": 1e-06, "loss": 0.1173, "num_tokens": 1830977612.0, "reward": 0.41831594705581665, "reward_std": 0.06758980453014374, "rewards/progression_diversity/mean": -0.0014180454891175032, "rewards/progression_diversity/std": 0.010988151654601097, "rewards/symbolic_reward_accuracy/mean": 0.330078125, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.7512043714523315, "rewards/symbolic_reward_partial_score/std": 0.2662599980831146, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0178977251052856, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 20.070125579833984, "step": 3045 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.2448933646082878, "epoch": 4.881410256410256, "grad_norm": 0.04779614508152008, "learning_rate": 1e-06, "loss": 0.1154, "step": 3046 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.26386718451976776, "epoch": 4.88301282051282, "grad_norm": 0.013240370899438858, "learning_rate": 1e-06, "loss": 0.0541, "step": 3047 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.25863589346408844, "epoch": 4.884615384615385, "grad_norm": 0.013798616826534271, "learning_rate": 1e-06, "loss": 0.0337, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3673.0, "completions/mean_length": 3313.994140625, "completions/mean_terminated_length": 1992.9398193359375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 0.23720625787973404, "epoch": 4.886217948717949, "frac_reward_zero_std": 0.34375, "grad_norm": 2632.1220703125, "learning_rate": 1e-06, "loss": 0.1328, "num_tokens": 1833494697.0, "reward": 0.3145332932472229, "reward_std": 0.07812836021184921, "rewards/progression_diversity/mean": -0.0022365176118910313, "rewards/progression_diversity/std": 0.015494197607040405, "rewards/symbolic_reward_accuracy/mean": 0.205078125, "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, "rewards/symbolic_reward_partial_score/mean": 0.6689616441726685, "rewards/symbolic_reward_partial_score/std": 0.2868025302886963, "rewards/tag_count_reward/mean": -0.091796875, "rewards/tag_count_reward/std": 0.289021372795105, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.003543734550476, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 26.09931182861328, "step": 3049 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2454667165875435, "epoch": 4.887820512820513, "grad_norm": 0.01591862179338932, "learning_rate": 1e-06, "loss": 0.0864, "step": 3050 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.24274562299251556, "epoch": 4.889423076923077, "grad_norm": 0.19580645859241486, "learning_rate": 1e-06, "loss": 0.0845, "step": 3051 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23570669442415237, "epoch": 4.891025641025641, "grad_norm": 0.03217773512005806, "learning_rate": 1e-06, "loss": 0.1089, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3641.0, "completions/mean_length": 2654.248046875, "completions/mean_terminated_length": 1979.0142822265625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.24544284492731094, "epoch": 4.892628205128205, "frac_reward_zero_std": 0.28125, "grad_norm": 1223.9429931640625, "learning_rate": 1e-06, "loss": 0.1096, "num_tokens": 1835634792.0, "reward": 0.47855299711227417, "reward_std": 0.09513897448778152, "rewards/progression_diversity/mean": -0.0026136531960219145, "rewards/progression_diversity/std": 0.020054563879966736, "rewards/symbolic_reward_accuracy/mean": 0.40625, "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, "rewards/symbolic_reward_partial_score/mean": 0.7983887195587158, "rewards/symbolic_reward_partial_score/std": 0.2503470480442047, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.015138864517212, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 20.58639144897461, "step": 3053 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.25392650067806244, "epoch": 4.894230769230769, "grad_norm": 0.018817421048879623, "learning_rate": 1e-06, "loss": 0.0569, "step": 3054 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2459840551018715, "epoch": 4.895833333333333, "grad_norm": 0.0089957220479846, "learning_rate": 1e-06, "loss": 0.0989, "step": 3055 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.24934467673301697, "epoch": 4.897435897435898, "grad_norm": 0.02581534907221794, "learning_rate": 1e-06, "loss": 0.0898, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.115234375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3635.0, "completions/mean_length": 3645.541015625, "completions/mean_terminated_length": 1986.4481201171875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "entropy": 0.23428595066070557, "epoch": 4.899038461538462, "frac_reward_zero_std": 0.15625, "grad_norm": 1076.086669921875, "learning_rate": 1e-06, "loss": 0.1253, "num_tokens": 1838393661.0, "reward": 0.22710196673870087, "reward_std": 0.09757393598556519, "rewards/progression_diversity/mean": -0.0007409834070131183, "rewards/progression_diversity/std": 0.006571199279278517, "rewards/symbolic_reward_accuracy/mean": 0.0859375, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.6222655773162842, "rewards/symbolic_reward_partial_score/std": 0.27893805503845215, "rewards/tag_count_reward/mean": -0.111328125, "rewards/tag_count_reward/std": 0.31484565138816833, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9937641620635986, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 29.560943603515625, "step": 3057 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2234666869044304, "epoch": 4.9006410256410255, "grad_norm": 0.025841714814305305, "learning_rate": 1e-06, "loss": 0.1654, "step": 3058 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.23846197873353958, "epoch": 4.902243589743589, "grad_norm": 0.01864476129412651, "learning_rate": 1e-06, "loss": 0.0935, "step": 3059 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.22488000988960266, "epoch": 4.903846153846154, "grad_norm": 0.20488475263118744, "learning_rate": 1e-06, "loss": 0.1767, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3840.0, "completions/mean_length": 2616.1171875, "completions/mean_terminated_length": 1968.5479736328125, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "entropy": 0.2517363280057907, "epoch": 4.905448717948718, "frac_reward_zero_std": 0.40625, "grad_norm": 162.09591674804688, "learning_rate": 1e-06, "loss": 0.0271, "num_tokens": 1840538281.0, "reward": 0.3412300944328308, "reward_std": 0.05017785727977753, "rewards/progression_diversity/mean": -4.024427471449599e-05, "rewards/progression_diversity/std": 0.0005465570138767362, "rewards/symbolic_reward_accuracy/mean": 0.21484375, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.7214192152023315, "rewards/symbolic_reward_partial_score/std": 0.2416561096906662, "rewards/tag_count_reward/mean": -0.041015625, "rewards/tag_count_reward/std": 0.19852031767368317, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0312939882278442, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 772.0, "sampling/sampling_logp_difference/mean": 12.591215133666992, "step": 3061 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.24662530422210693, "epoch": 4.907051282051282, "grad_norm": 3917.875732421875, "learning_rate": 1e-06, "loss": 0.147, "step": 3062 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2500121220946312, "epoch": 4.908653846153846, "grad_norm": 2733.346923828125, "learning_rate": 1e-06, "loss": 0.2713, "step": 3063 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23512417078018188, "epoch": 4.910256410256411, "grad_norm": 0.14125868678092957, "learning_rate": 1e-06, "loss": 0.1516, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.095703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3573.0, "completions/mean_length": 3311.6796875, "completions/mean_terminated_length": 1928.2159423828125, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "entropy": 0.2293601930141449, "epoch": 4.9118589743589745, "frac_reward_zero_std": 0.34375, "grad_norm": 1017.6434326171875, "learning_rate": 1e-06, "loss": 0.0998, "num_tokens": 1843132549.0, "reward": 0.29114842414855957, "reward_std": 0.08108247071504593, "rewards/progression_diversity/mean": -0.00039431118057109416, "rewards/progression_diversity/std": 0.004132647532969713, "rewards/symbolic_reward_accuracy/mean": 0.1640625, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.6723307371139526, "rewards/symbolic_reward_partial_score/std": 0.2803674042224884, "rewards/tag_count_reward/mean": -0.08984375, "rewards/tag_count_reward/std": 0.2862374484539032, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.004433274269104, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 24.679431915283203, "step": 3065 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.22065846621990204, "epoch": 4.913461538461538, "grad_norm": 0.017783688381314278, "learning_rate": 1e-06, "loss": 0.1363, "step": 3066 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.23686835914850235, "epoch": 4.915064102564102, "grad_norm": 0.03829358518123627, "learning_rate": 1e-06, "loss": 0.0933, "step": 3067 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22281410545110703, "epoch": 4.916666666666667, "grad_norm": 0.013630473986268044, "learning_rate": 1e-06, "loss": 0.1489, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3399.0, "completions/mean_length": 3175.4453125, "completions/mean_terminated_length": 1933.615478515625, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "entropy": 0.23225311189889908, "epoch": 4.918269230769231, "frac_reward_zero_std": 0.34375, "grad_norm": 1236.8885498046875, "learning_rate": 1e-06, "loss": 0.0914, "num_tokens": 1845645129.0, "reward": 0.29391324520111084, "reward_std": 0.04953677952289581, "rewards/progression_diversity/mean": -0.0002773654996417463, "rewards/progression_diversity/std": 0.003748962190002203, "rewards/symbolic_reward_accuracy/mean": 0.15625, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.6932617425918579, "rewards/symbolic_reward_partial_score/std": 0.2669588029384613, "rewards/tag_count_reward/mean": -0.078125, "rewards/tag_count_reward/std": 0.26863065361976624, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.011780023574829, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 22.082910537719727, "step": 3069 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.22503124177455902, "epoch": 4.919871794871795, "grad_norm": 0.12676434218883514, "learning_rate": 1e-06, "loss": 0.1061, "step": 3070 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.22606287896633148, "epoch": 4.921474358974359, "grad_norm": 0.02233606018126011, "learning_rate": 1e-06, "loss": 0.106, "step": 3071 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2343372032046318, "epoch": 4.923076923076923, "grad_norm": 0.014616597443819046, "learning_rate": 1e-06, "loss": 0.1043, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.087890625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3779.0, "completions/mean_length": 3257.8828125, "completions/mean_terminated_length": 1993.053466796875, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "entropy": 0.21944886445999146, "epoch": 4.924679487179487, "frac_reward_zero_std": 0.15625, "grad_norm": 1409.2808837890625, "learning_rate": 1e-06, "loss": 0.1516, "num_tokens": 1848203293.0, "reward": 0.21257862448692322, "reward_std": 0.07666586339473724, "rewards/progression_diversity/mean": -0.00044064479880034924, "rewards/progression_diversity/std": 0.003953089937567711, "rewards/symbolic_reward_accuracy/mean": 0.06640625, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.6037923097610474, "rewards/symbolic_reward_partial_score/std": 0.2407861053943634, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995574355125427, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 28.18425941467285, "step": 3073 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2246202975511551, "epoch": 4.926282051282051, "grad_norm": 0.31535470485687256, "learning_rate": 1e-06, "loss": 0.1254, "step": 3074 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.23829004913568497, "epoch": 4.927884615384615, "grad_norm": 0.02114877477288246, "learning_rate": 1e-06, "loss": 0.0652, "step": 3075 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.22181038558483124, "epoch": 4.92948717948718, "grad_norm": 0.021001050248742104, "learning_rate": 1e-06, "loss": 0.1621, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3972.0, "completions/mean_length": 2757.482421875, "completions/mean_terminated_length": 1998.892822265625, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "entropy": 0.2369692623615265, "epoch": 4.931089743589744, "frac_reward_zero_std": 0.34375, "grad_norm": 1576.586181640625, "learning_rate": 1e-06, "loss": 0.1226, "num_tokens": 1850362948.0, "reward": 0.3155801594257355, "reward_std": 0.0678568109869957, "rewards/progression_diversity/mean": -9.189260163111612e-05, "rewards/progression_diversity/std": 0.0013428285019472241, "rewards/symbolic_reward_accuracy/mean": 0.177734375, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.7120931148529053, "rewards/symbolic_reward_partial_score/std": 0.23257166147232056, "rewards/tag_count_reward/mean": -0.046875, "rewards/tag_count_reward/std": 0.21157780289649963, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.022636890411377, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 18.015220642089844, "step": 3077 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.2449949011206627, "epoch": 4.9326923076923075, "grad_norm": 986.05517578125, "learning_rate": 1e-06, "loss": 0.1112, "step": 3078 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2491198480129242, "epoch": 4.934294871794872, "grad_norm": 0.01813652738928795, "learning_rate": 1e-06, "loss": 0.0695, "step": 3079 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.24884822964668274, "epoch": 4.935897435897436, "grad_norm": 0.030230067670345306, "learning_rate": 1e-06, "loss": 0.0596, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3653.0, "completions/mean_length": 3115.162109375, "completions/mean_terminated_length": 2051.41552734375, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "entropy": 0.2273581624031067, "epoch": 4.9375, "frac_reward_zero_std": 0.1875, "grad_norm": 1620.06494140625, "learning_rate": 1e-06, "loss": 0.1231, "num_tokens": 1852808343.0, "reward": 0.3891579508781433, "reward_std": 0.09541543573141098, "rewards/progression_diversity/mean": -0.0007119444198906422, "rewards/progression_diversity/std": 0.004969864152371883, "rewards/symbolic_reward_accuracy/mean": 0.298828125, "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, "rewards/symbolic_reward_partial_score/mean": 0.7229980230331421, "rewards/symbolic_reward_partial_score/std": 0.27403724193573, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0014233589172363, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 27.424591064453125, "step": 3081 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22573982179164886, "epoch": 4.939102564102564, "grad_norm": 0.013195838779211044, "learning_rate": 1e-06, "loss": 0.1688, "step": 3082 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.22488176822662354, "epoch": 4.940705128205128, "grad_norm": 0.01642797514796257, "learning_rate": 1e-06, "loss": 0.1161, "step": 3083 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.22580020129680634, "epoch": 4.9423076923076925, "grad_norm": 0.030097253620624542, "learning_rate": 1e-06, "loss": 0.1038, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3909.0, "completions/mean_length": 2938.462890625, "completions/mean_terminated_length": 2012.1524658203125, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "entropy": 0.24025756865739822, "epoch": 4.943910256410256, "frac_reward_zero_std": 0.3125, "grad_norm": 427.4413757324219, "learning_rate": 1e-06, "loss": 0.0368, "num_tokens": 1855159284.0, "reward": 0.3869500756263733, "reward_std": 0.09471500664949417, "rewards/progression_diversity/mean": -0.0007927106926217675, "rewards/progression_diversity/std": 0.0072387754917144775, "rewards/symbolic_reward_accuracy/mean": 0.28515625, "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, "rewards/symbolic_reward_partial_score/mean": 0.7403808832168579, "rewards/symbolic_reward_partial_score/std": 0.2626368999481201, "rewards/tag_count_reward/mean": -0.0625, "rewards/tag_count_reward/std": 0.2422981858253479, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.011838674545288, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 22.941444396972656, "step": 3085 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2293272241950035, "epoch": 4.94551282051282, "grad_norm": 0.014672359451651573, "learning_rate": 1e-06, "loss": 0.0897, "step": 3086 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.23675543069839478, "epoch": 4.947115384615385, "grad_norm": 0.03274073079228401, "learning_rate": 1e-06, "loss": 2.9293, "step": 3087 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.21905168890953064, "epoch": 4.948717948717949, "grad_norm": 0.013550637289881706, "learning_rate": 1e-06, "loss": 0.181, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.044921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3887.0, "completions/mean_length": 2637.939453125, "completions/mean_terminated_length": 1991.396728515625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "entropy": 0.23665142059326172, "epoch": 4.950320512820513, "frac_reward_zero_std": 0.28125, "grad_norm": 1990.0498046875, "learning_rate": 1e-06, "loss": 0.1275, "num_tokens": 1857301445.0, "reward": 0.35283172130584717, "reward_std": 0.07654442638158798, "rewards/progression_diversity/mean": -0.0010098177008330822, "rewards/progression_diversity/std": 0.007632537744939327, "rewards/symbolic_reward_accuracy/mean": 0.23828125, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.7145507335662842, "rewards/symbolic_reward_partial_score/std": 0.2468079924583435, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0146788358688354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 22.055877685546875, "step": 3089 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2322361320257187, "epoch": 4.951923076923077, "grad_norm": 0.017291007563471794, "learning_rate": 1e-06, "loss": 0.1592, "step": 3090 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.24952280521392822, "epoch": 4.953525641025641, "grad_norm": 0.010401350446045399, "learning_rate": 1e-06, "loss": 0.0266, "step": 3091 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2399914711713791, "epoch": 4.955128205128205, "grad_norm": 0.35759103298187256, "learning_rate": 1e-06, "loss": 0.0707, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.076171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 3135.34375, "completions/mean_terminated_length": 2042.9598388671875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "entropy": 0.2194850966334343, "epoch": 4.956730769230769, "frac_reward_zero_std": 0.1875, "grad_norm": 1517.0322265625, "learning_rate": 1e-06, "loss": 0.2334, "num_tokens": 1859834453.0, "reward": 0.2989712357521057, "reward_std": 0.10124389827251434, "rewards/progression_diversity/mean": -0.0013144873082637787, "rewards/progression_diversity/std": 0.011923530139029026, "rewards/symbolic_reward_accuracy/mean": 0.185546875, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.6483073234558105, "rewards/symbolic_reward_partial_score/std": 0.26260486245155334, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9993857145309448, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 29.454757690429688, "step": 3093 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.23910460621118546, "epoch": 4.958333333333333, "grad_norm": 0.11438003182411194, "learning_rate": 1e-06, "loss": 0.0364, "step": 3094 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.2314603254199028, "epoch": 4.959935897435898, "grad_norm": 0.013372881338000298, "learning_rate": 1e-06, "loss": 0.1093, "step": 3095 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.22888445109128952, "epoch": 4.961538461538462, "grad_norm": 0.04182210564613342, "learning_rate": 1e-06, "loss": 0.1042, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3800.0, "completions/mean_length": 3504.990234375, "completions/mean_terminated_length": 2049.10205078125, "completions/min_length": 499.0, "completions/min_terminated_length": 499.0, "entropy": 0.21291837096214294, "epoch": 4.9631410256410255, "frac_reward_zero_std": 0.21875, "grad_norm": 1515.2686767578125, "learning_rate": 1e-06, "loss": 0.1687, "num_tokens": 1862539136.0, "reward": 0.38587409257888794, "reward_std": 0.12614938616752625, "rewards/progression_diversity/mean": -0.0014604174066334963, "rewards/progression_diversity/std": 0.010783915407955647, "rewards/symbolic_reward_accuracy/mean": 0.310546875, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.6984049677848816, "rewards/symbolic_reward_partial_score/std": 0.3126681447029114, "rewards/tag_count_reward/mean": -0.099609375, "rewards/tag_count_reward/std": 0.29977133870124817, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9896366000175476, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 34.82575607299805, "step": 3097 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.23361308127641678, "epoch": 4.964743589743589, "grad_norm": 99630.9453125, "learning_rate": 1e-06, "loss": 1.774, "step": 3098 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.21884525567293167, "epoch": 4.966346153846154, "grad_norm": 0.037266023457050323, "learning_rate": 1e-06, "loss": 0.1422, "step": 3099 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.21776709705591202, "epoch": 4.967948717948718, "grad_norm": 0.022990232333540916, "learning_rate": 1e-06, "loss": 0.1704, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.07421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 3794.0, "completions/mean_length": 3065.734375, "completions/mean_terminated_length": 1998.0252685546875, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "entropy": 0.2290486916899681, "epoch": 4.969551282051282, "frac_reward_zero_std": 0.21875, "grad_norm": 1406.708984375, "learning_rate": 1e-06, "loss": 0.0795, "num_tokens": 1864989176.0, "reward": 0.27424493432044983, "reward_std": 0.08562411367893219, "rewards/progression_diversity/mean": -0.0012889985227957368, "rewards/progression_diversity/std": 0.011297719553112984, "rewards/symbolic_reward_accuracy/mean": 0.134765625, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.6680989265441895, "rewards/symbolic_reward_partial_score/std": 0.251154363155365, "rewards/tag_count_reward/mean": -0.0703125, "rewards/tag_count_reward/std": 0.25592297315597534, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.004550576210022, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 27.251415252685547, "step": 3101 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.21562334895133972, "epoch": 4.971153846153846, "grad_norm": 0.22579920291900635, "learning_rate": 1e-06, "loss": 0.1258, "step": 3102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.22457308322191238, "epoch": 4.972756410256411, "grad_norm": 0.023017656058073044, "learning_rate": 1e-06, "loss": 423466112.0, "step": 3103 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2277710661292076, "epoch": 4.9743589743589745, "grad_norm": 0.0158623605966568, "learning_rate": 1e-06, "loss": 0.088, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3392.0, "completions/mean_length": 2962.814453125, "completions/mean_terminated_length": 1947.766845703125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "entropy": 0.22295588999986649, "epoch": 4.975961538461538, "frac_reward_zero_std": 0.28125, "grad_norm": 615.591796875, "learning_rate": 1e-06, "loss": 0.1338, "num_tokens": 1867381977.0, "reward": 0.31032562255859375, "reward_std": 0.08792345225811005, "rewards/progression_diversity/mean": -0.001620155293494463, "rewards/progression_diversity/std": 0.014947640709578991, "rewards/symbolic_reward_accuracy/mean": 0.16796875, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.7187174558639526, "rewards/symbolic_reward_partial_score/std": 0.2543519139289856, "rewards/tag_count_reward/mean": -0.060546875, "rewards/tag_count_reward/std": 0.2387305200099945, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0067986249923706, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 26.045305252075195, "step": 3105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2358352467417717, "epoch": 4.977564102564102, "grad_norm": 0.02432454563677311, "learning_rate": 1e-06, "loss": 0.0836, "step": 3106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.22167550772428513, "epoch": 4.979166666666667, "grad_norm": 0.01078137755393982, "learning_rate": 1e-06, "loss": 0.1256, "step": 3107 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.22863591462373734, "epoch": 4.980769230769231, "grad_norm": 0.019278274849057198, "learning_rate": 1e-06, "loss": 0.0846, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.048828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 3294.0, "completions/mean_length": 2566.49609375, "completions/mean_terminated_length": 1857.1787109375, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "entropy": 0.25486068427562714, "epoch": 4.982371794871795, "frac_reward_zero_std": 0.34375, "grad_norm": 590.8904418945312, "learning_rate": 1e-06, "loss": 0.0226, "num_tokens": 1869483767.0, "reward": 0.3811664581298828, "reward_std": 0.06205814331769943, "rewards/progression_diversity/mean": -0.0015223543159663677, "rewards/progression_diversity/std": 0.01551423966884613, "rewards/symbolic_reward_accuracy/mean": 0.267578125, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.7497721314430237, "rewards/symbolic_reward_partial_score/std": 0.24262651801109314, "rewards/tag_count_reward/mean": -0.04296875, "rewards/tag_count_reward/std": 0.2029850035905838, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0239535570144653, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 18.67233657836914, "step": 3109 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.23615090548992157, "epoch": 4.983974358974359, "grad_norm": 37597.58984375, "learning_rate": 1e-06, "loss": 0.8181, "step": 3110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.24501872807741165, "epoch": 4.985576923076923, "grad_norm": 0.01006036065518856, "learning_rate": 1e-06, "loss": 0.0984, "step": 3111 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.24807747453451157, "epoch": 4.987179487179487, "grad_norm": 0.02072441577911377, "learning_rate": 1e-06, "loss": 0.0622, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.052734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 3321.0, "completions/mean_length": 2686.251953125, "completions/mean_terminated_length": 1923.697021484375, "completions/min_length": 427.0, "completions/min_terminated_length": 427.0, "entropy": 0.234246626496315, "epoch": 4.988782051282051, "frac_reward_zero_std": 0.28125, "grad_norm": 554.318603515625, "learning_rate": 1e-06, "loss": 0.0841, "num_tokens": 1871779112.0, "reward": 0.3497743010520935, "reward_std": 0.06554560363292694, "rewards/progression_diversity/mean": -0.0010880029294639826, "rewards/progression_diversity/std": 0.008479480631649494, "rewards/symbolic_reward_accuracy/mean": 0.236328125, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.7102213501930237, "rewards/symbolic_reward_partial_score/std": 0.2495347112417221, "rewards/tag_count_reward/mean": -0.05078125, "rewards/tag_count_reward/std": 0.21976542472839355, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0156009197235107, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 21.600927352905273, "step": 3113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2407403290271759, "epoch": 4.990384615384615, "grad_norm": 0.019321072846651077, "learning_rate": 1e-06, "loss": 0.0487, "step": 3114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23200398683547974, "epoch": 4.99198717948718, "grad_norm": 0.015974221751093864, "learning_rate": 1e-06, "loss": 0.1028, "step": 3115 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.23310399055480957, "epoch": 4.993589743589744, "grad_norm": 0.016877546906471252, "learning_rate": 1e-06, "loss": 0.1081, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 2393.515625, "completions/mean_terminated_length": 1883.740966796875, "completions/min_length": 592.0, "completions/min_terminated_length": 592.0, "entropy": 0.24390202015638351, "epoch": 4.9951923076923075, "frac_reward_zero_std": 0.5, "grad_norm": 1346.7171630859375, "learning_rate": 1e-06, "loss": 0.0759, "num_tokens": 1873829408.0, "reward": 0.3640533685684204, "reward_std": 0.027338774874806404, "rewards/progression_diversity/mean": -0.0004282901354599744, "rewards/progression_diversity/std": 0.0046591991558671, "rewards/symbolic_reward_accuracy/mean": 0.25, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.7239420413970947, "rewards/symbolic_reward_partial_score/std": 0.23576416075229645, "rewards/tag_count_reward/mean": -0.03125, "rewards/tag_count_reward/std": 0.17416280508041382, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0342185497283936, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 776.0, "sampling/sampling_logp_difference/mean": 13.47022533416748, "step": 3117 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.24693401902914047, "epoch": 4.996794871794872, "grad_norm": 2261.0185546875, "learning_rate": 1e-06, "loss": 0.0913, "step": 3118 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24139049649238586, "epoch": 4.998397435897436, "grad_norm": 0.024692002683877945, "learning_rate": 1e-06, "loss": 0.0602, "step": 3119 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2525310665369034, "epoch": 5.0, "grad_norm": 0.013732725754380226, "learning_rate": 1e-06, "loss": 0.0156, "step": 3120 }, { "epoch": 5.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.063720703125, "eval_completions/max_length": 16002.59375, "eval_completions/max_terminated_length": 3275.65625, "eval_completions/mean_length": 2817.200927734375, "eval_completions/mean_terminated_length": 1894.647304534912, "eval_completions/min_length": 834.84375, "eval_completions/min_terminated_length": 834.84375, "eval_entropy": 0.22543939435854554, "eval_frac_reward_zero_std": 0.28125, "eval_loss": 0.04206552356481552, "eval_num_tokens": 1873829408.0, "eval_reward": 0.24393098056316376, "eval_reward_std": 0.05101948481751606, "eval_rewards/progression_diversity/mean": -0.001373334529262138, "eval_rewards/progression_diversity/std": 0.01101221157978216, "eval_rewards/symbolic_reward_accuracy/mean": 0.0966796875, "eval_rewards/symbolic_reward_accuracy/std": 0.21552498079836369, "eval_rewards/symbolic_reward_partial_score/mean": 0.6393208876252174, "eval_rewards/symbolic_reward_partial_score/std": 0.22751979576423764, "eval_rewards/tag_count_reward/mean": -0.05859375, "eval_rewards/tag_count_reward/std": 0.21680113021284342, "eval_runtime": 4356.8719, "eval_samples_per_second": 0.057, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0280762165784836, "eval_sampling/importance_sampling_ratio/min": 1.5225041253212082e-39, "eval_sampling/sampling_logp_difference/max": 754.4348351955414, "eval_sampling/sampling_logp_difference/mean": 16.028895314550027, "eval_steps_per_second": 0.0, "step": 3120 }, { "epoch": 5.0, "step": 3120, "total_flos": 0.0, "train_loss": 235461599.143655, "train_runtime": 175420.2215, "train_samples_per_second": 0.143, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 3120, "num_input_tokens_seen": 1873829408, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }