{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.197265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16276.0, "completions/mean_length": 6760.4375, "completions/mean_terminated_length": 4395.52294921875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "entropy": 0.2916708439588547, "epoch": 0.002631578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 0.035724807530641556, "learning_rate": 1e-06, "loss": 0.1316, "num_tokens": 3865152.0, "reward": 0.07896194607019424, "reward_std": 0.10644184798002243, "rewards/progression_diversity/mean": -0.009079374372959137, "rewards/progression_diversity/std": 0.053816426545381546, "rewards/symbolic_reward_accuracy/mean": 0.009765625, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.30517578125, "rewards/symbolic_reward_partial_score/std": 0.25365734100341797, "rewards/tag_count_reward/mean": -0.18359375, "rewards/tag_count_reward/std": 0.3875311613082886, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0618481636047363, "sampling/importance_sampling_ratio/min": 0.00013467908138409257, "sampling/sampling_logp_difference/max": 8.912615776062012, "sampling/sampling_logp_difference/mean": 0.10774467885494232, "step": 1 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2698264718055725, "epoch": 0.005263157894736842, "grad_norm": 0.032932084053754807, "learning_rate": 1e-06, "loss": 0.1813, "step": 2 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.26824212074279785, "epoch": 0.007894736842105263, "grad_norm": 0.0307021327316761, "learning_rate": 1e-06, "loss": 0.2115, "step": 3 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2804792523384094, "epoch": 0.010526315789473684, "grad_norm": 0.031359054148197174, "learning_rate": 1e-06, "loss": 0.1183, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.166015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15442.0, "completions/mean_length": 6071.873046875, "completions/mean_terminated_length": 4019.107666015625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "entropy": 0.28796158730983734, "epoch": 0.013157894736842105, "frac_reward_zero_std": 0.0, "grad_norm": 0.032425932586193085, "learning_rate": 1e-06, "loss": 0.1857, "num_tokens": 7368415.0, "reward": 0.08325499296188354, "reward_std": 0.0968296229839325, "rewards/progression_diversity/mean": -0.004579948727041483, "rewards/progression_diversity/std": 0.0396733395755291, "rewards/symbolic_reward_accuracy/mean": 0.005859375, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.3193359375, "rewards/symbolic_reward_partial_score/std": 0.2464565932750702, "rewards/tag_count_reward/mean": -0.16015625, "rewards/tag_count_reward/std": 0.3671095669269562, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0649909973144531, "sampling/importance_sampling_ratio/min": 4.260127752786502e-07, "sampling/sampling_logp_difference/max": 14.66879653930664, "sampling/sampling_logp_difference/mean": 0.11339153349399567, "step": 5 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2807093560695648, "epoch": 0.015789473684210527, "grad_norm": 0.02408398687839508, "learning_rate": 1e-06, "loss": 0.1071, "step": 6 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.28542497754096985, "epoch": 0.018421052631578946, "grad_norm": 0.0270535945892334, "learning_rate": 1e-06, "loss": 0.1096, "step": 7 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2942824959754944, "epoch": 0.021052631578947368, "grad_norm": 0.023121848702430725, "learning_rate": 1e-06, "loss": 0.2167, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.162109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15805.0, "completions/mean_length": 5919.259765625, "completions/mean_terminated_length": 3894.613037109375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.28526371717453003, "epoch": 0.02368421052631579, "frac_reward_zero_std": 0.0, "grad_norm": 0.033250465989112854, "learning_rate": 1e-06, "loss": 0.1934, "num_tokens": 10792196.0, "reward": 0.08689245581626892, "reward_std": 0.0983029454946518, "rewards/progression_diversity/mean": -0.007043677382171154, "rewards/progression_diversity/std": 0.04186973348259926, "rewards/symbolic_reward_accuracy/mean": 0.0078125, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.32568359375, "rewards/symbolic_reward_partial_score/std": 0.24731339514255524, "rewards/tag_count_reward/mean": -0.154296875, "rewards/tag_count_reward/std": 0.36158639192581177, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0634013414382935, "sampling/importance_sampling_ratio/min": 0.0004096508782822639, "sampling/sampling_logp_difference/max": 7.800205230712891, "sampling/sampling_logp_difference/mean": 0.11012717336416245, "step": 9 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2955394983291626, "epoch": 0.02631578947368421, "grad_norm": 0.02542749233543873, "learning_rate": 1e-06, "loss": 0.1529, "step": 10 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.28635716438293457, "epoch": 0.02894736842105263, "grad_norm": 0.02619725838303566, "learning_rate": 1e-06, "loss": 0.1198, "step": 11 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.28302156925201416, "epoch": 0.031578947368421054, "grad_norm": 0.022730253636837006, "learning_rate": 1e-06, "loss": 0.1629, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16322.0, "completions/mean_length": 6270.1875, "completions/mean_terminated_length": 3692.156982421875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.2731642723083496, "epoch": 0.034210526315789476, "frac_reward_zero_std": 0.0, "grad_norm": 0.030661821365356445, "learning_rate": 1e-06, "loss": 0.1548, "num_tokens": 14396676.0, "reward": 0.08138585090637207, "reward_std": 0.10238391160964966, "rewards/progression_diversity/mean": -0.00594722805544734, "rewards/progression_diversity/std": 0.03126208111643791, "rewards/symbolic_reward_accuracy/mean": 0.0078125, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.3177083432674408, "rewards/symbolic_reward_partial_score/std": 0.24931958317756653, "rewards/tag_count_reward/mean": -0.185546875, "rewards/tag_count_reward/std": 0.38912075757980347, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0586175918579102, "sampling/importance_sampling_ratio/min": 1.9201727354811737e-06, "sampling/sampling_logp_difference/max": 13.163095474243164, "sampling/sampling_logp_difference/mean": 0.10211914777755737, "step": 13 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.28947339951992035, "epoch": 0.03684210526315789, "grad_norm": 0.024929087609052658, "learning_rate": 1e-06, "loss": 0.1721, "step": 14 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.26907335221767426, "epoch": 0.039473684210526314, "grad_norm": 0.03776842728257179, "learning_rate": 1e-06, "loss": 0.1558, "step": 15 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.27320994436740875, "epoch": 0.042105263157894736, "grad_norm": 0.02968466654419899, "learning_rate": 1e-06, "loss": 0.2034, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.189453125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16088.0, "completions/mean_length": 6075.619140625, "completions/mean_terminated_length": 3666.1904296875, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "entropy": 0.2953624576330185, "epoch": 0.04473684210526316, "frac_reward_zero_std": 0.0, "grad_norm": 0.03018520586192608, "learning_rate": 1e-06, "loss": 0.106, "num_tokens": 17914529.0, "reward": 0.08459560573101044, "reward_std": 0.11007633805274963, "rewards/progression_diversity/mean": -0.007236707955598831, "rewards/progression_diversity/std": 0.039540816098451614, "rewards/symbolic_reward_accuracy/mean": 0.013671875, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.3193359375, "rewards/symbolic_reward_partial_score/std": 0.2542729675769806, "rewards/tag_count_reward/mean": -0.193359375, "rewards/tag_count_reward/std": 0.39531853795051575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059499979019165, "sampling/importance_sampling_ratio/min": 5.811744358652504e-06, "sampling/sampling_logp_difference/max": 12.05562973022461, "sampling/sampling_logp_difference/mean": 0.10546315461397171, "step": 17 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2861950248479843, "epoch": 0.04736842105263158, "grad_norm": 0.03124823048710823, "learning_rate": 1e-06, "loss": 0.1526, "step": 18 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.27276280522346497, "epoch": 0.05, "grad_norm": 0.027596795931458473, "learning_rate": 1e-06, "loss": 0.1598, "step": 19 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2472858428955078, "epoch": 0.05263157894736842, "grad_norm": 0.023518428206443787, "learning_rate": 1e-06, "loss": 0.2783, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.216796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14747.0, "completions/mean_length": 6167.828125, "completions/mean_terminated_length": 3339.910400390625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "entropy": 0.24753674864768982, "epoch": 0.05526315789473684, "frac_reward_zero_std": 0.0, "grad_norm": 0.033617325127124786, "learning_rate": 1e-06, "loss": 0.23, "num_tokens": 21476745.0, "reward": 0.08549793064594269, "reward_std": 0.09992466866970062, "rewards/progression_diversity/mean": -0.009777872823178768, "rewards/progression_diversity/std": 0.05313728749752045, "rewards/symbolic_reward_accuracy/mean": 0.0078125, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.33740234375, "rewards/symbolic_reward_partial_score/std": 0.24341119825839996, "rewards/tag_count_reward/mean": -0.203125, "rewards/tag_count_reward/std": 0.4027182459831238, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.055070161819458, "sampling/importance_sampling_ratio/min": 9.157028398476541e-05, "sampling/sampling_logp_difference/max": 9.2984037399292, "sampling/sampling_logp_difference/mean": 0.09585890173912048, "step": 21 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.26523521542549133, "epoch": 0.05789473684210526, "grad_norm": 0.02759598009288311, "learning_rate": 1e-06, "loss": 0.1624, "step": 22 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.28485071659088135, "epoch": 0.060526315789473685, "grad_norm": 0.029536111280322075, "learning_rate": 1e-06, "loss": 0.1678, "step": 23 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.274502232670784, "epoch": 0.06315789473684211, "grad_norm": 0.028490744531154633, "learning_rate": 1e-06, "loss": 0.1451, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.208984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14513.0, "completions/mean_length": 6365.98828125, "completions/mean_terminated_length": 3719.25439453125, "completions/min_length": 382.0, "completions/min_terminated_length": 382.0, "entropy": 0.25441184639930725, "epoch": 0.06578947368421052, "frac_reward_zero_std": 0.0, "grad_norm": 0.03975823521614075, "learning_rate": 1e-06, "loss": 0.2797, "num_tokens": 25147619.0, "reward": 0.086149662733078, "reward_std": 0.10140813887119293, "rewards/progression_diversity/mean": -0.00808083638548851, "rewards/progression_diversity/std": 0.04241418465971947, "rewards/symbolic_reward_accuracy/mean": 0.0078125, "rewards/symbolic_reward_accuracy/std": 0.08812850713729858, "rewards/symbolic_reward_partial_score/mean": 0.3395182490348816, "rewards/symbolic_reward_partial_score/std": 0.24165737628936768, "rewards/tag_count_reward/mean": -0.203125, "rewards/tag_count_reward/std": 0.4027182459831238, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0544767379760742, "sampling/importance_sampling_ratio/min": 3.484850741308776e-10, "sampling/sampling_logp_difference/max": 21.77742576599121, "sampling/sampling_logp_difference/mean": 0.09544065594673157, "step": 25 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.24812303483486176, "epoch": 0.06842105263157895, "grad_norm": 0.026934118941426277, "learning_rate": 1e-06, "loss": 0.2254, "step": 26 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2512124478816986, "epoch": 0.07105263157894737, "grad_norm": 0.03495849668979645, "learning_rate": 1e-06, "loss": 0.1695, "step": 27 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.28809893131256104, "epoch": 0.07368421052631578, "grad_norm": 0.02224835380911827, "learning_rate": 1e-06, "loss": 0.0802, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.169921875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15402.0, "completions/mean_length": 5648.064453125, "completions/mean_terminated_length": 3450.355224609375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.268292635679245, "epoch": 0.07631578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 0.032919030636548996, "learning_rate": 1e-06, "loss": 0.2178, "num_tokens": 28440228.0, "reward": 0.09082336723804474, "reward_std": 0.10907775163650513, "rewards/progression_diversity/mean": -0.0045784548856318, "rewards/progression_diversity/std": 0.02940884418785572, "rewards/symbolic_reward_accuracy/mean": 0.01171875, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.3361002802848816, "rewards/symbolic_reward_partial_score/std": 0.24807466566562653, "rewards/tag_count_reward/mean": -0.169921875, "rewards/tag_count_reward/std": 0.3759314715862274, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0602651834487915, "sampling/importance_sampling_ratio/min": 0.00284669641405344, "sampling/sampling_logp_difference/max": 5.86159610748291, "sampling/sampling_logp_difference/mean": 0.10585412383079529, "step": 29 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2800886482000351, "epoch": 0.07894736842105263, "grad_norm": 0.029134223237633705, "learning_rate": 1e-06, "loss": 0.1811, "step": 30 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2816573828458786, "epoch": 0.08157894736842106, "grad_norm": 0.033635616302490234, "learning_rate": 1e-06, "loss": 0.127, "step": 31 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2971540093421936, "epoch": 0.08421052631578947, "grad_norm": 0.023819033056497574, "learning_rate": 1e-06, "loss": 0.1382, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.173828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13264.0, "completions/mean_length": 5243.712890625, "completions/mean_terminated_length": 2899.775390625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.273567333817482, "epoch": 0.0868421052631579, "frac_reward_zero_std": 0.0, "grad_norm": 0.029521364718675613, "learning_rate": 1e-06, "loss": 0.1954, "num_tokens": 31522481.0, "reward": 0.10308191180229187, "reward_std": 0.1121080070734024, "rewards/progression_diversity/mean": -0.00919200200587511, "rewards/progression_diversity/std": 0.04991353675723076, "rewards/symbolic_reward_accuracy/mean": 0.015625, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.3712565004825592, "rewards/symbolic_reward_partial_score/std": 0.23796787858009338, "rewards/tag_count_reward/mean": -0.17578125, "rewards/tag_count_reward/std": 0.3810062110424042, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0566962957382202, "sampling/importance_sampling_ratio/min": 1.2478832331908052e-06, "sampling/sampling_logp_difference/max": 13.594061851501465, "sampling/sampling_logp_difference/mean": 0.09926008433103561, "step": 33 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2565145939588547, "epoch": 0.08947368421052632, "grad_norm": 0.04480559378862381, "learning_rate": 1e-06, "loss": 0.2772, "step": 34 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2905775457620621, "epoch": 0.09210526315789473, "grad_norm": 0.02144668437540531, "learning_rate": 1e-06, "loss": 0.0548, "step": 35 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.28261056542396545, "epoch": 0.09473684210526316, "grad_norm": 0.01856975071132183, "learning_rate": 1e-06, "loss": 0.1399, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.208984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15877.0, "completions/mean_length": 5636.087890625, "completions/mean_terminated_length": 2796.51611328125, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.27043572068214417, "epoch": 0.09736842105263158, "frac_reward_zero_std": 0.0, "grad_norm": 0.026228487491607666, "learning_rate": 1e-06, "loss": 0.1468, "num_tokens": 34785758.0, "reward": 0.09762196242809296, "reward_std": 0.12116604298353195, "rewards/progression_diversity/mean": -0.008311246521770954, "rewards/progression_diversity/std": 0.046763546764850616, "rewards/symbolic_reward_accuracy/mean": 0.01953125, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.35302734375, "rewards/symbolic_reward_partial_score/std": 0.24829283356666565, "rewards/tag_count_reward/mean": -0.19921875, "rewards/tag_count_reward/std": 0.39980348944664, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0502464771270752, "sampling/importance_sampling_ratio/min": 8.39968015498016e-06, "sampling/sampling_logp_difference/max": 11.68731689453125, "sampling/sampling_logp_difference/mean": 0.08828973770141602, "step": 37 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.26759083569049835, "epoch": 0.1, "grad_norm": 0.02601916529238224, "learning_rate": 1e-06, "loss": 0.1606, "step": 38 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2611129730939865, "epoch": 0.10263157894736842, "grad_norm": 0.02105417661368847, "learning_rate": 1e-06, "loss": 0.2227, "step": 39 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.23032942414283752, "epoch": 0.10526315789473684, "grad_norm": 0.02530878223478794, "learning_rate": 1e-06, "loss": 0.1808, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.23828125, "completions/max_length": 16384.0, "completions/max_terminated_length": 13911.0, "completions/mean_length": 5937.931640625, "completions/mean_terminated_length": 2670.187255859375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "entropy": 0.24540461599826813, "epoch": 0.10789473684210527, "frac_reward_zero_std": 0.0, "grad_norm": 0.02969300001859665, "learning_rate": 1e-06, "loss": 0.1987, "num_tokens": 38220283.0, "reward": 0.09507475793361664, "reward_std": 0.11622758209705353, "rewards/progression_diversity/mean": -0.014009159058332443, "rewards/progression_diversity/std": 0.06170212849974632, "rewards/symbolic_reward_accuracy/mean": 0.01953125, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.3531900942325592, "rewards/symbolic_reward_partial_score/std": 0.2495628297328949, "rewards/tag_count_reward/mean": -0.224609375, "rewards/tag_count_reward/std": 0.41773295402526855, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0466715097427368, "sampling/importance_sampling_ratio/min": 1.8073988030664623e-05, "sampling/sampling_logp_difference/max": 10.921036720275879, "sampling/sampling_logp_difference/mean": 0.08196896314620972, "step": 41 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.24679341912269592, "epoch": 0.11052631578947368, "grad_norm": 0.028460616245865822, "learning_rate": 1e-06, "loss": 0.234, "step": 42 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.23327426612377167, "epoch": 0.11315789473684211, "grad_norm": 0.030783407390117645, "learning_rate": 1e-06, "loss": 0.244, "step": 43 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2536231279373169, "epoch": 0.11578947368421053, "grad_norm": 0.023375684395432472, "learning_rate": 1e-06, "loss": 0.1264, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.224609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14565.0, "completions/mean_length": 5743.708984375, "completions/mean_terminated_length": 2661.5087890625, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.23496582359075546, "epoch": 0.11842105263157894, "frac_reward_zero_std": 0.0, "grad_norm": 0.02578672021627426, "learning_rate": 1e-06, "loss": 0.2409, "num_tokens": 41567718.0, "reward": 0.08303609490394592, "reward_std": 0.08830304443836212, "rewards/progression_diversity/mean": -0.011820437386631966, "rewards/progression_diversity/std": 0.057891786098480225, "rewards/symbolic_reward_accuracy/mean": 0.001953125, "rewards/symbolic_reward_accuracy/std": 0.04419417306780815, "rewards/symbolic_reward_partial_score/mean": 0.3468424677848816, "rewards/symbolic_reward_partial_score/std": 0.23540066182613373, "rewards/tag_count_reward/mean": -0.220703125, "rewards/tag_count_reward/std": 0.4151262938976288, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0478100776672363, "sampling/importance_sampling_ratio/min": 1.1134955002489733e-07, "sampling/sampling_logp_difference/max": 16.010591506958008, "sampling/sampling_logp_difference/mean": 0.08461372554302216, "step": 45 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.2394789233803749, "epoch": 0.12105263157894737, "grad_norm": 0.024003252387046814, "learning_rate": 1e-06, "loss": 0.1561, "step": 46 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.25297604501247406, "epoch": 0.12368421052631579, "grad_norm": 0.022317850962281227, "learning_rate": 1e-06, "loss": 0.1457, "step": 47 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2308100312948227, "epoch": 0.12631578947368421, "grad_norm": 0.027584845200181007, "learning_rate": 1e-06, "loss": 0.252, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 16384.0, "completions/mean_length": 5191.498046875, "completions/mean_terminated_length": 2474.871337890625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "entropy": 0.2666362375020981, "epoch": 0.12894736842105264, "frac_reward_zero_std": 0.03125, "grad_norm": 0.025992998853325844, "learning_rate": 1e-06, "loss": 0.1962, "num_tokens": 44623557.0, "reward": 0.10004402697086334, "reward_std": 0.10086282342672348, "rewards/progression_diversity/mean": -0.015129456296563148, "rewards/progression_diversity/std": 0.06190123409032822, "rewards/symbolic_reward_accuracy/mean": 0.013671875, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.3704427182674408, "rewards/symbolic_reward_partial_score/std": 0.23484936356544495, "rewards/tag_count_reward/mean": -0.19140625, "rewards/tag_count_reward/std": 0.3937928080558777, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.048860788345337, "sampling/importance_sampling_ratio/min": 2.264517434014124e-06, "sampling/sampling_logp_difference/max": 12.998148918151855, "sampling/sampling_logp_difference/mean": 0.08639080822467804, "step": 49 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2490471974015236, "epoch": 0.13157894736842105, "grad_norm": 0.021342728286981583, "learning_rate": 1e-06, "loss": 0.2177, "step": 50 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2422972321510315, "epoch": 0.13421052631578947, "grad_norm": 0.02412317879498005, "learning_rate": 1e-06, "loss": 0.2011, "step": 51 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.25571010261774063, "epoch": 0.1368421052631579, "grad_norm": 0.018414035439491272, "learning_rate": 1e-06, "loss": 0.1253, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.228515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15860.0, "completions/mean_length": 5547.751953125, "completions/mean_terminated_length": 2338.02783203125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "entropy": 0.222146138548851, "epoch": 0.1394736842105263, "frac_reward_zero_std": 0.0, "grad_norm": 0.02548670582473278, "learning_rate": 1e-06, "loss": 0.2067, "num_tokens": 47871142.0, "reward": 0.09103752672672272, "reward_std": 0.108045294880867, "rewards/progression_diversity/mean": -0.017341770231723785, "rewards/progression_diversity/std": 0.06780385971069336, "rewards/symbolic_reward_accuracy/mean": 0.009765625, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.3548177182674408, "rewards/symbolic_reward_partial_score/std": 0.23816773295402527, "rewards/tag_count_reward/mean": -0.2109375, "rewards/tag_count_reward/std": 0.4083731174468994, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041557788848877, "sampling/importance_sampling_ratio/min": 3.7289887586666737e-06, "sampling/sampling_logp_difference/max": 12.499373435974121, "sampling/sampling_logp_difference/mean": 0.07308885455131531, "step": 53 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.22613000124692917, "epoch": 0.14210526315789473, "grad_norm": 0.023181065917015076, "learning_rate": 1e-06, "loss": 0.2628, "step": 54 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.2536846548318863, "epoch": 0.14473684210526316, "grad_norm": 0.018339237198233604, "learning_rate": 1e-06, "loss": 0.0959, "step": 55 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.22903449833393097, "epoch": 0.14736842105263157, "grad_norm": 0.026370180770754814, "learning_rate": 1e-06, "loss": 0.2495, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.248046875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16286.0, "completions/mean_length": 5735.234375, "completions/mean_terminated_length": 2222.524658203125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.2157709002494812, "epoch": 0.15, "frac_reward_zero_std": 0.0, "grad_norm": 0.02145228534936905, "learning_rate": 1e-06, "loss": 0.1847, "num_tokens": 51218910.0, "reward": 0.09276480972766876, "reward_std": 0.1126212552189827, "rewards/progression_diversity/mean": -0.01062830537557602, "rewards/progression_diversity/std": 0.049262356013059616, "rewards/symbolic_reward_accuracy/mean": 0.013671875, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.3681640625, "rewards/symbolic_reward_partial_score/std": 0.23709788918495178, "rewards/tag_count_reward/mean": -0.2578125, "rewards/tag_count_reward/std": 0.43785804510116577, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.038439154624939, "sampling/importance_sampling_ratio/min": 1.4415817428670152e-09, "sampling/sampling_logp_difference/max": 20.357524871826172, "sampling/sampling_logp_difference/mean": 0.0689418762922287, "step": 57 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2207200676202774, "epoch": 0.15263157894736842, "grad_norm": 0.02863519825041294, "learning_rate": 1e-06, "loss": 0.2579, "step": 58 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.21071892976760864, "epoch": 0.15526315789473685, "grad_norm": 0.02227974310517311, "learning_rate": 1e-06, "loss": 0.2465, "step": 59 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23656637221574783, "epoch": 0.15789473684210525, "grad_norm": 0.01663924567401409, "learning_rate": 1e-06, "loss": 0.2336, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14423.0, "completions/mean_length": 5544.6953125, "completions/mean_terminated_length": 2080.587646484375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "entropy": 0.23988273739814758, "epoch": 0.16052631578947368, "frac_reward_zero_std": 0.0, "grad_norm": 0.027361398562788963, "learning_rate": 1e-06, "loss": 0.1632, "num_tokens": 54458434.0, "reward": 0.09711417555809021, "reward_std": 0.11729248613119125, "rewards/progression_diversity/mean": -0.010262314230203629, "rewards/progression_diversity/std": 0.05750845745205879, "rewards/symbolic_reward_accuracy/mean": 0.015625, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.3767903745174408, "rewards/symbolic_reward_partial_score/std": 0.2343655377626419, "rewards/tag_count_reward/mean": -0.251953125, "rewards/tag_count_reward/std": 0.43455907702445984, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0386930704116821, "sampling/importance_sampling_ratio/min": 2.470594381520641e-06, "sampling/sampling_logp_difference/max": 12.911051750183105, "sampling/sampling_logp_difference/mean": 0.0692630261182785, "step": 61 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.22496215999126434, "epoch": 0.1631578947368421, "grad_norm": 0.01953737623989582, "learning_rate": 1e-06, "loss": 0.2318, "step": 62 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.23897476494312286, "epoch": 0.16578947368421051, "grad_norm": 0.018175503239035606, "learning_rate": 1e-06, "loss": 0.1639, "step": 63 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.19628984481096268, "epoch": 0.16842105263157894, "grad_norm": 0.02357460930943489, "learning_rate": 1e-06, "loss": 0.3235, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.259765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15521.0, "completions/mean_length": 5778.126953125, "completions/mean_terminated_length": 2056.277099609375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "entropy": 0.22985464334487915, "epoch": 0.17105263157894737, "frac_reward_zero_std": 0.0, "grad_norm": 0.02823863923549652, "learning_rate": 1e-06, "loss": 0.2111, "num_tokens": 57850179.0, "reward": 0.08986608684062958, "reward_std": 0.10897202044725418, "rewards/progression_diversity/mean": -0.017297808080911636, "rewards/progression_diversity/std": 0.06803877651691437, "rewards/symbolic_reward_accuracy/mean": 0.009765625, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.365234375, "rewards/symbolic_reward_partial_score/std": 0.23282982409000397, "rewards/tag_count_reward/mean": -0.25390625, "rewards/tag_count_reward/std": 0.43567025661468506, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0341827869415283, "sampling/importance_sampling_ratio/min": 3.349011603859253e-05, "sampling/sampling_logp_difference/max": 10.30426025390625, "sampling/sampling_logp_difference/mean": 0.06156843155622482, "step": 65 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.22416777163743973, "epoch": 0.1736842105263158, "grad_norm": 0.026787322014570236, "learning_rate": 1e-06, "loss": 0.232, "step": 66 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.1903344914317131, "epoch": 0.1763157894736842, "grad_norm": 0.026732493191957474, "learning_rate": 1e-06, "loss": 0.286, "step": 67 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.1874106228351593, "epoch": 0.17894736842105263, "grad_norm": 0.02596643753349781, "learning_rate": 1e-06, "loss": 0.2322, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15584.0, "completions/mean_length": 5451.146484375, "completions/mean_terminated_length": 1496.7100830078125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "entropy": 0.1980566829442978, "epoch": 0.18157894736842106, "frac_reward_zero_std": 0.0, "grad_norm": 0.02380208671092987, "learning_rate": 1e-06, "loss": 0.2229, "num_tokens": 61031118.0, "reward": 0.09820634126663208, "reward_std": 0.12823785841464996, "rewards/progression_diversity/mean": -0.018233338370919228, "rewards/progression_diversity/std": 0.07207600027322769, "rewards/symbolic_reward_accuracy/mean": 0.01953125, "rewards/symbolic_reward_accuracy/std": 0.1385180652141571, "rewards/symbolic_reward_partial_score/mean": 0.37744140625, "rewards/symbolic_reward_partial_score/std": 0.23769913613796234, "rewards/tag_count_reward/mean": -0.265625, "rewards/tag_count_reward/std": 0.44209739565849304, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.030740737915039, "sampling/importance_sampling_ratio/min": 5.1614351832540706e-05, "sampling/sampling_logp_difference/max": 9.871710777282715, "sampling/sampling_logp_difference/mean": 0.055819302797317505, "step": 69 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2236703857779503, "epoch": 0.18421052631578946, "grad_norm": 0.018748750910162926, "learning_rate": 1e-06, "loss": 0.19, "step": 70 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.22987084090709686, "epoch": 0.1868421052631579, "grad_norm": 0.021655604243278503, "learning_rate": 1e-06, "loss": 0.1942, "step": 71 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.17527379840612411, "epoch": 0.18947368421052632, "grad_norm": 0.025045355781912804, "learning_rate": 1e-06, "loss": 0.2901, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15275.0, "completions/mean_length": 5332.80859375, "completions/mean_terminated_length": 1335.569091796875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.2162635251879692, "epoch": 0.19210526315789472, "frac_reward_zero_std": 0.0, "grad_norm": 0.02276972495019436, "learning_rate": 1e-06, "loss": 0.2146, "num_tokens": 64162156.0, "reward": 0.10593515634536743, "reward_std": 0.12418779730796814, "rewards/progression_diversity/mean": -0.021718915551900864, "rewards/progression_diversity/std": 0.07438741624355316, "rewards/symbolic_reward_accuracy/mean": 0.025390625, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.3896484375, "rewards/symbolic_reward_partial_score/std": 0.23622576892375946, "rewards/tag_count_reward/mean": -0.259765625, "rewards/tag_count_reward/std": 0.4389347732067108, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0349256992340088, "sampling/importance_sampling_ratio/min": 1.1178142813150771e-05, "sampling/sampling_logp_difference/max": 11.40155029296875, "sampling/sampling_logp_difference/mean": 0.06305186450481415, "step": 73 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.19458654522895813, "epoch": 0.19473684210526315, "grad_norm": 0.020284034311771393, "learning_rate": 1e-06, "loss": 0.2623, "step": 74 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.22814790159463882, "epoch": 0.19736842105263158, "grad_norm": 0.017509233206510544, "learning_rate": 1e-06, "loss": 0.1189, "step": 75 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.20635511726140976, "epoch": 0.2, "grad_norm": 0.017491551116108894, "learning_rate": 1e-06, "loss": 0.2417, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.263671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16298.0, "completions/mean_length": 5354.87109375, "completions/mean_terminated_length": 1405.4482421875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.20123746991157532, "epoch": 0.2026315789473684, "frac_reward_zero_std": 0.0, "grad_norm": 0.018690288066864014, "learning_rate": 1e-06, "loss": 0.1404, "num_tokens": 67315338.0, "reward": 0.09713856130838394, "reward_std": 0.11651948094367981, "rewards/progression_diversity/mean": -0.022472595795989037, "rewards/progression_diversity/std": 0.0805739313364029, "rewards/symbolic_reward_accuracy/mean": 0.015625, "rewards/symbolic_reward_accuracy/std": 0.12414088100194931, "rewards/symbolic_reward_partial_score/mean": 0.3818359375, "rewards/symbolic_reward_partial_score/std": 0.23188191652297974, "rewards/tag_count_reward/mean": -0.265625, "rewards/tag_count_reward/std": 0.44209739565849304, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0267584323883057, "sampling/importance_sampling_ratio/min": 6.460865731305887e-10, "sampling/sampling_logp_difference/max": 21.16008758544922, "sampling/sampling_logp_difference/mean": 0.04901731014251709, "step": 77 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.19783911854028702, "epoch": 0.20526315789473684, "grad_norm": 0.019168507307767868, "learning_rate": 1e-06, "loss": 0.2772, "step": 78 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.18744147568941116, "epoch": 0.20789473684210527, "grad_norm": 0.021555200219154358, "learning_rate": 1e-06, "loss": 0.2715, "step": 79 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.18239572644233704, "epoch": 0.21052631578947367, "grad_norm": 0.015542907640337944, "learning_rate": 1e-06, "loss": 0.2586, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.24609375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14582.0, "completions/mean_length": 4839.45703125, "completions/mean_terminated_length": 1071.031005859375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "entropy": 0.215584859251976, "epoch": 0.2131578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 0.016407115384936333, "learning_rate": 1e-06, "loss": 0.1701, "num_tokens": 70206132.0, "reward": 0.10850296914577484, "reward_std": 0.12094779312610626, "rewards/progression_diversity/mean": -0.018844161182641983, "rewards/progression_diversity/std": 0.07712793350219727, "rewards/symbolic_reward_accuracy/mean": 0.0234375, "rewards/symbolic_reward_accuracy/std": 0.15143637359142303, "rewards/symbolic_reward_partial_score/mean": 0.3974609375, "rewards/symbolic_reward_partial_score/std": 0.22982890903949738, "rewards/tag_count_reward/mean": -0.24609375, "rewards/tag_count_reward/std": 0.4311550557613373, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0279490947723389, "sampling/importance_sampling_ratio/min": 0.00020903853874187917, "sampling/sampling_logp_difference/max": 8.472991943359375, "sampling/sampling_logp_difference/mean": 0.05078686401247978, "step": 81 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.1952200084924698, "epoch": 0.21578947368421053, "grad_norm": 0.017001083120703697, "learning_rate": 1e-06, "loss": 0.267, "step": 82 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.1928211748600006, "epoch": 0.21842105263157896, "grad_norm": 0.017411494627594948, "learning_rate": 1e-06, "loss": 0.2582, "step": 83 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.22684676200151443, "epoch": 0.22105263157894736, "grad_norm": 0.014094403013586998, "learning_rate": 1e-06, "loss": 0.2321, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14469.0, "completions/mean_length": 3569.38671875, "completions/mean_terminated_length": 982.3990478515625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.24647942930459976, "epoch": 0.2236842105263158, "frac_reward_zero_std": 0.0, "grad_norm": 0.016102120280265808, "learning_rate": 1e-06, "loss": 0.1723, "num_tokens": 72421946.0, "reward": 0.11379130184650421, "reward_std": 0.10348815470933914, "rewards/progression_diversity/mean": -0.01735476776957512, "rewards/progression_diversity/std": 0.07398708909749985, "rewards/symbolic_reward_accuracy/mean": 0.013671875, "rewards/symbolic_reward_accuracy/std": 0.1162383034825325, "rewards/symbolic_reward_partial_score/mean": 0.4085286259651184, "rewards/symbolic_reward_partial_score/std": 0.21359524130821228, "rewards/tag_count_reward/mean": -0.16796875, "rewards/tag_count_reward/std": 0.374204158782959, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0335094928741455, "sampling/importance_sampling_ratio/min": 0.000267473777057603, "sampling/sampling_logp_difference/max": 8.226489067077637, "sampling/sampling_logp_difference/mean": 0.06112096086144447, "step": 85 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2344566285610199, "epoch": 0.22631578947368422, "grad_norm": 0.019014785066246986, "learning_rate": 1e-06, "loss": 0.2905, "step": 86 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.24500514566898346, "epoch": 0.22894736842105262, "grad_norm": 0.015879755839705467, "learning_rate": 1e-06, "loss": 0.2251, "step": 87 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.25136934220790863, "epoch": 0.23157894736842105, "grad_norm": 0.01475497055798769, "learning_rate": 1e-06, "loss": 0.1835, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.12109375, "completions/max_length": 16384.0, "completions/max_terminated_length": 15631.0, "completions/mean_length": 2819.962890625, "completions/mean_terminated_length": 951.1400146484375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "entropy": 0.24405641853809357, "epoch": 0.23421052631578948, "frac_reward_zero_std": 0.0, "grad_norm": 0.024288885295391083, "learning_rate": 1e-06, "loss": 0.212, "num_tokens": 74253031.0, "reward": 0.13387255370616913, "reward_std": 0.12053509056568146, "rewards/progression_diversity/mean": -0.011182607151567936, "rewards/progression_diversity/std": 0.06163937970995903, "rewards/symbolic_reward_accuracy/mean": 0.029296875, "rewards/symbolic_reward_accuracy/std": 0.16880230605602264, "rewards/symbolic_reward_partial_score/mean": 0.4270833134651184, "rewards/symbolic_reward_partial_score/std": 0.21671809256076813, "rewards/tag_count_reward/mean": -0.1171875, "rewards/tag_count_reward/std": 0.32195815443992615, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041147232055664, "sampling/importance_sampling_ratio/min": 0.0011610162910073996, "sampling/sampling_logp_difference/max": 6.758459568023682, "sampling/sampling_logp_difference/mean": 0.0764530748128891, "step": 89 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2542632222175598, "epoch": 0.23684210526315788, "grad_norm": 0.01583160273730755, "learning_rate": 1e-06, "loss": 0.1479, "step": 90 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.27341794967651367, "epoch": 0.2394736842105263, "grad_norm": 0.013665684498846531, "learning_rate": 1e-06, "loss": 0.1461, "step": 91 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.24389638006687164, "epoch": 0.24210526315789474, "grad_norm": 0.0186525397002697, "learning_rate": 1e-06, "loss": 0.1631, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 16204.0, "completions/mean_length": 3517.671875, "completions/mean_terminated_length": 992.504638671875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.23533198237419128, "epoch": 0.24473684210526317, "frac_reward_zero_std": 0.0, "grad_norm": 0.018597114831209183, "learning_rate": 1e-06, "loss": 0.1957, "num_tokens": 76442207.0, "reward": 0.1313730925321579, "reward_std": 0.13323497772216797, "rewards/progression_diversity/mean": -0.01698843017220497, "rewards/progression_diversity/std": 0.07352975755929947, "rewards/symbolic_reward_accuracy/mean": 0.03515625, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.4254557192325592, "rewards/symbolic_reward_partial_score/std": 0.2250940501689911, "rewards/tag_count_reward/mean": -0.171875, "rewards/tag_count_reward/std": 0.3776407241821289, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0345077514648438, "sampling/importance_sampling_ratio/min": 2.56181897384522e-06, "sampling/sampling_logp_difference/max": 12.87479305267334, "sampling/sampling_logp_difference/mean": 0.06349129974842072, "step": 93 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.24531496316194534, "epoch": 0.24736842105263157, "grad_norm": 0.015834983438253403, "learning_rate": 1e-06, "loss": 0.1729, "step": 94 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.21955663710832596, "epoch": 0.25, "grad_norm": 0.01730695739388466, "learning_rate": 1e-06, "loss": 0.2377, "step": 95 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.21611076593399048, "epoch": 0.25263157894736843, "grad_norm": 0.015181013382971287, "learning_rate": 1e-06, "loss": 0.1776, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15096.0, "completions/mean_length": 2637.765625, "completions/mean_terminated_length": 813.0442504882812, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "entropy": 0.25482119619846344, "epoch": 0.25526315789473686, "frac_reward_zero_std": 0.0, "grad_norm": 0.014081789180636406, "learning_rate": 1e-06, "loss": 0.1076, "num_tokens": 78194887.0, "reward": 0.14719998836517334, "reward_std": 0.14902149140834808, "rewards/progression_diversity/mean": -0.011447252705693245, "rewards/progression_diversity/std": 0.06131342053413391, "rewards/symbolic_reward_accuracy/mean": 0.046875, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.4383138120174408, "rewards/symbolic_reward_partial_score/std": 0.22779904305934906, "rewards/tag_count_reward/mean": -0.123046875, "rewards/tag_count_reward/std": 0.32881227135658264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0369794368743896, "sampling/importance_sampling_ratio/min": 3.3109614605564275e-08, "sampling/sampling_logp_difference/max": 17.22344207763672, "sampling/sampling_logp_difference/mean": 0.06858888268470764, "step": 97 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.24463753402233124, "epoch": 0.2578947368421053, "grad_norm": 0.06137360259890556, "learning_rate": 1e-06, "loss": 0.1628, "step": 98 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.24564764648675919, "epoch": 0.26052631578947366, "grad_norm": 0.01690017245709896, "learning_rate": 1e-06, "loss": 0.1851, "step": 99 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.24623940885066986, "epoch": 0.2631578947368421, "grad_norm": 0.013457868248224258, "learning_rate": 1e-06, "loss": 0.1721, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.10546875, "completions/max_length": 16384.0, "completions/max_terminated_length": 15936.0, "completions/mean_length": 2532.294921875, "completions/mean_terminated_length": 899.1244506835938, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "entropy": 0.24528837949037552, "epoch": 0.2657894736842105, "frac_reward_zero_std": 0.03125, "grad_norm": 0.014075527898967266, "learning_rate": 1e-06, "loss": 0.1249, "num_tokens": 79901246.0, "reward": 0.1632891744375229, "reward_std": 0.15703746676445007, "rewards/progression_diversity/mean": -0.008974803611636162, "rewards/progression_diversity/std": 0.05761713534593582, "rewards/symbolic_reward_accuracy/mean": 0.068359375, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.4469400942325592, "rewards/symbolic_reward_partial_score/std": 0.24197350442409515, "rewards/tag_count_reward/mean": -0.1171875, "rewards/tag_count_reward/std": 0.32195815443992615, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.041948676109314, "sampling/importance_sampling_ratio/min": 1.5186431001623646e-09, "sampling/sampling_logp_difference/max": 20.305448532104492, "sampling/sampling_logp_difference/mean": 0.07734344899654388, "step": 101 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.26768821477890015, "epoch": 0.26842105263157895, "grad_norm": 0.014358071610331535, "learning_rate": 1e-06, "loss": 0.125, "step": 102 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2417154163122177, "epoch": 0.2710526315789474, "grad_norm": 0.019036587327718735, "learning_rate": 1e-06, "loss": 0.221, "step": 103 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.26473505795001984, "epoch": 0.2736842105263158, "grad_norm": 0.014921830035746098, "learning_rate": 1e-06, "loss": 0.073, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.080078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 5375.0, "completions/mean_length": 1962.900390625, "completions/mean_terminated_length": 707.560546875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2539156824350357, "epoch": 0.27631578947368424, "frac_reward_zero_std": 0.0, "grad_norm": 0.01651758700609207, "learning_rate": 1e-06, "loss": 0.1921, "num_tokens": 81315211.0, "reward": 0.15442626178264618, "reward_std": 0.14438994228839874, "rewards/progression_diversity/mean": -0.011476235464215279, "rewards/progression_diversity/std": 0.06588800996541977, "rewards/symbolic_reward_accuracy/mean": 0.044921875, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.4532877802848816, "rewards/symbolic_reward_partial_score/std": 0.21151751279830933, "rewards/tag_count_reward/mean": -0.083984375, "rewards/tag_count_reward/std": 0.2776356339454651, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0463956594467163, "sampling/importance_sampling_ratio/min": 1.2895628742626286e-07, "sampling/sampling_logp_difference/max": 15.863792419433594, "sampling/sampling_logp_difference/mean": 0.08601226657629013, "step": 105 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.26433275640010834, "epoch": 0.2789473684210526, "grad_norm": 0.012559227645397186, "learning_rate": 1e-06, "loss": 0.154, "step": 106 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.27126292884349823, "epoch": 0.28157894736842104, "grad_norm": 0.011530612595379353, "learning_rate": 1e-06, "loss": 0.0815, "step": 107 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.28323958814144135, "epoch": 0.28421052631578947, "grad_norm": 0.015649767592549324, "learning_rate": 1e-06, "loss": 0.115, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.083984375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16114.0, "completions/mean_length": 2091.642578125, "completions/mean_terminated_length": 781.255859375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "entropy": 0.2563259154558182, "epoch": 0.2868421052631579, "frac_reward_zero_std": 0.0625, "grad_norm": 0.01467968337237835, "learning_rate": 1e-06, "loss": 0.0872, "num_tokens": 82791284.0, "reward": 0.17522379755973816, "reward_std": 0.14781689643859863, "rewards/progression_diversity/mean": -0.0069169411435723305, "rewards/progression_diversity/std": 0.05124044418334961, "rewards/symbolic_reward_accuracy/mean": 0.06640625, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.4807942509651184, "rewards/symbolic_reward_partial_score/std": 0.20960326492786407, "rewards/tag_count_reward/mean": -0.087890625, "rewards/tag_count_reward/std": 0.2834126651287079, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0412476062774658, "sampling/importance_sampling_ratio/min": 0.0006217172485776246, "sampling/sampling_logp_difference/max": 7.383025169372559, "sampling/sampling_logp_difference/mean": 0.07769767940044403, "step": 109 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.27861388027668, "epoch": 0.2894736842105263, "grad_norm": 0.012335779145359993, "learning_rate": 1e-06, "loss": 0.0808, "step": 110 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.2279139682650566, "epoch": 0.29210526315789476, "grad_norm": 0.01729213260114193, "learning_rate": 1e-06, "loss": 0.1974, "step": 111 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.28070947527885437, "epoch": 0.29473684210526313, "grad_norm": 0.014035679399967194, "learning_rate": 1e-06, "loss": 0.06, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068359375, "completions/max_length": 16384.0, "completions/max_terminated_length": 16285.0, "completions/mean_length": 1909.013671875, "completions/mean_terminated_length": 846.90771484375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "entropy": 0.2596060037612915, "epoch": 0.29736842105263156, "frac_reward_zero_std": 0.03125, "grad_norm": 0.011908638291060925, "learning_rate": 1e-06, "loss": 0.1146, "num_tokens": 84171323.0, "reward": 0.19407010078430176, "reward_std": 0.17639988660812378, "rewards/progression_diversity/mean": -0.011936957947909832, "rewards/progression_diversity/std": 0.07680150121450424, "rewards/symbolic_reward_accuracy/mean": 0.091796875, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.48779296875, "rewards/symbolic_reward_partial_score/std": 0.23039613664150238, "rewards/tag_count_reward/mean": -0.072265625, "rewards/tag_count_reward/std": 0.2591804563999176, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0448040962219238, "sampling/importance_sampling_ratio/min": 0.0012326554860919714, "sampling/sampling_logp_difference/max": 6.69858455657959, "sampling/sampling_logp_difference/mean": 0.08408962190151215, "step": 113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.23410920798778534, "epoch": 0.3, "grad_norm": 0.017887957394123077, "learning_rate": 1e-06, "loss": 0.229, "step": 114 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2956717312335968, "epoch": 0.3026315789473684, "grad_norm": 0.013839812017977238, "learning_rate": 1e-06, "loss": 0.0347, "step": 115 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.27330218255519867, "epoch": 0.30526315789473685, "grad_norm": 0.011987395584583282, "learning_rate": 1e-06, "loss": 0.0768, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13832.0, "completions/mean_length": 1699.599609375, "completions/mean_terminated_length": 720.6396484375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "entropy": 0.26409123837947845, "epoch": 0.3078947368421053, "frac_reward_zero_std": 0.03125, "grad_norm": 0.017784157767891884, "learning_rate": 1e-06, "loss": 0.1191, "num_tokens": 85467502.0, "reward": 0.19023296236991882, "reward_std": 0.1661926507949829, "rewards/progression_diversity/mean": -0.005026046186685562, "rewards/progression_diversity/std": 0.033581074327230453, "rewards/symbolic_reward_accuracy/mean": 0.0859375, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.4851887822151184, "rewards/symbolic_reward_partial_score/std": 0.23089143633842468, "rewards/tag_count_reward/mean": -0.068359375, "rewards/tag_count_reward/std": 0.25260838866233826, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0457016229629517, "sampling/importance_sampling_ratio/min": 1.8673212025532848e-06, "sampling/sampling_logp_difference/max": 13.19100570678711, "sampling/sampling_logp_difference/mean": 0.0859871432185173, "step": 117 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2728601396083832, "epoch": 0.3105263157894737, "grad_norm": 0.016618477180600166, "learning_rate": 1e-06, "loss": 0.1202, "step": 118 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2710384130477905, "epoch": 0.3131578947368421, "grad_norm": 0.015051553957164288, "learning_rate": 1e-06, "loss": 0.1286, "step": 119 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.281009241938591, "epoch": 0.3157894736842105, "grad_norm": 0.01171377208083868, "learning_rate": 1e-06, "loss": 0.0637, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03515625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 1197.052734375, "completions/mean_terminated_length": 643.6821899414062, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.2854345440864563, "epoch": 0.31842105263157894, "frac_reward_zero_std": 0.03125, "grad_norm": 0.016470612958073616, "learning_rate": 1e-06, "loss": 0.0497, "num_tokens": 86488713.0, "reward": 0.19141869246959686, "reward_std": 0.158242329955101, "rewards/progression_diversity/mean": -0.00363965705037117, "rewards/progression_diversity/std": 0.03781190514564514, "rewards/symbolic_reward_accuracy/mean": 0.080078125, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.4930012822151184, "rewards/symbolic_reward_partial_score/std": 0.2159450650215149, "rewards/tag_count_reward/mean": -0.044921875, "rewards/tag_count_reward/std": 0.20733514428138733, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0579612255096436, "sampling/importance_sampling_ratio/min": 0.0007034747395664454, "sampling/sampling_logp_difference/max": 7.259478569030762, "sampling/sampling_logp_difference/mean": 0.10833683609962463, "step": 121 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2941920757293701, "epoch": 0.32105263157894737, "grad_norm": 0.01042513083666563, "learning_rate": 1e-06, "loss": 0.0543, "step": 122 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.29098397493362427, "epoch": 0.3236842105263158, "grad_norm": 0.012190199457108974, "learning_rate": 1e-06, "loss": 0.0446, "step": 123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.27491484582424164, "epoch": 0.3263157894736842, "grad_norm": 0.011241449974477291, "learning_rate": 1e-06, "loss": 0.0765, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.029296875, "completions/max_length": 16384.0, "completions/max_terminated_length": 9764.0, "completions/mean_length": 1113.736328125, "completions/mean_terminated_length": 652.8631591796875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.276974618434906, "epoch": 0.32894736842105265, "frac_reward_zero_std": 0.03125, "grad_norm": 0.012477315962314606, "learning_rate": 1e-06, "loss": 0.075, "num_tokens": 87448258.0, "reward": 0.2307177186012268, "reward_std": 0.18792694807052612, "rewards/progression_diversity/mean": -0.004401450511068106, "rewards/progression_diversity/std": 0.04930144548416138, "rewards/symbolic_reward_accuracy/mean": 0.1328125, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.5133463740348816, "rewards/symbolic_reward_partial_score/std": 0.250105082988739, "rewards/tag_count_reward/mean": -0.029296875, "rewards/tag_count_reward/std": 0.16880230605602264, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0609157085418701, "sampling/importance_sampling_ratio/min": 9.104550713345816e-08, "sampling/sampling_logp_difference/max": 16.21190643310547, "sampling/sampling_logp_difference/mean": 0.11256176233291626, "step": 125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.2965611666440964, "epoch": 0.33157894736842103, "grad_norm": 0.012660843320190907, "learning_rate": 1e-06, "loss": 0.0433, "step": 126 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2932172417640686, "epoch": 0.33421052631578946, "grad_norm": 0.012092087417840958, "learning_rate": 1e-06, "loss": 0.0378, "step": 127 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.30187714099884033, "epoch": 0.3368421052631579, "grad_norm": 0.0075751859694719315, "learning_rate": 1e-06, "loss": 0.0291, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1831.0, "completions/mean_length": 945.626953125, "completions/mean_terminated_length": 606.6607055664062, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "entropy": 0.30951452255249023, "epoch": 0.3394736842105263, "frac_reward_zero_std": 0.03125, "grad_norm": 0.015834081918001175, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 88331555.0, "reward": 0.24679304659366608, "reward_std": 0.20520557463169098, "rewards/progression_diversity/mean": -0.003313412657007575, "rewards/progression_diversity/std": 0.04354095831513405, "rewards/symbolic_reward_accuracy/mean": 0.150390625, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.5291340947151184, "rewards/symbolic_reward_partial_score/std": 0.25848349928855896, "rewards/tag_count_reward/mean": -0.021484375, "rewards/tag_count_reward/std": 0.14513419568538666, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0643789768218994, "sampling/importance_sampling_ratio/min": 0.0014044058043509722, "sampling/sampling_logp_difference/max": 6.568140983581543, "sampling/sampling_logp_difference/mean": 0.12036062777042389, "step": 129 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2890625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3004407584667206, "epoch": 0.34210526315789475, "grad_norm": 0.008652472868561745, "learning_rate": 1e-06, "loss": 0.0179, "step": 130 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.28511108458042145, "epoch": 0.3447368421052632, "grad_norm": 0.011511228047311306, "learning_rate": 1e-06, "loss": 0.0139, "step": 131 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2902694195508957, "epoch": 0.3473684210526316, "grad_norm": 0.011798321269452572, "learning_rate": 1e-06, "loss": 0.0542, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021484375, "completions/max_length": 16384.0, "completions/max_terminated_length": 2008.0, "completions/mean_length": 944.07421875, "completions/mean_terminated_length": 605.0738525390625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "entropy": 0.2803247421979904, "epoch": 0.35, "frac_reward_zero_std": 0.0, "grad_norm": 0.01663512922823429, "learning_rate": 1e-06, "loss": 0.0881, "num_tokens": 89217385.0, "reward": 0.2798116207122803, "reward_std": 0.22263681888580322, "rewards/progression_diversity/mean": -0.002236358355730772, "rewards/progression_diversity/std": 0.03316602483391762, "rewards/symbolic_reward_accuracy/mean": 0.1953125, "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, "rewards/symbolic_reward_partial_score/mean": 0.5499674081802368, "rewards/symbolic_reward_partial_score/std": 0.2764817774295807, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0633987188339233, "sampling/importance_sampling_ratio/min": 5.007424078939948e-06, "sampling/sampling_logp_difference/max": 12.204588890075684, "sampling/sampling_logp_difference/mean": 0.11828067898750305, "step": 133 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3119271844625473, "epoch": 0.3526315789473684, "grad_norm": 0.014867684803903103, "learning_rate": 1e-06, "loss": 0.0002, "step": 134 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.289058193564415, "epoch": 0.35526315789473684, "grad_norm": 0.009648671373724937, "learning_rate": 1e-06, "loss": 0.0339, "step": 135 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.29756639897823334, "epoch": 0.35789473684210527, "grad_norm": 0.011878606863319874, "learning_rate": 1e-06, "loss": 0.0322, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15970.0, "completions/mean_length": 799.517578125, "completions/mean_terminated_length": 645.824462890625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.3042868822813034, "epoch": 0.3605263157894737, "frac_reward_zero_std": 0.0, "grad_norm": 0.012307998724281788, "learning_rate": 1e-06, "loss": 0.0108, "num_tokens": 90016530.0, "reward": 0.32415273785591125, "reward_std": 0.21440261602401733, "rewards/progression_diversity/mean": -0.0017208305653184652, "rewards/progression_diversity/std": 0.024005113169550896, "rewards/symbolic_reward_accuracy/mean": 0.248046875, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.5896810293197632, "rewards/symbolic_reward_partial_score/std": 0.2770000696182251, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0701053142547607, "sampling/importance_sampling_ratio/min": 0.000937148230150342, "sampling/sampling_logp_difference/max": 6.9726691246032715, "sampling/sampling_logp_difference/mean": 0.13084274530410767, "step": 137 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.31592294573783875, "epoch": 0.3631578947368421, "grad_norm": 0.013102422468364239, "learning_rate": 1e-06, "loss": -0.0013, "step": 138 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.2929012179374695, "epoch": 0.36578947368421055, "grad_norm": 0.012112054973840714, "learning_rate": 1e-06, "loss": 0.0511, "step": 139 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.30165381729602814, "epoch": 0.3684210526315789, "grad_norm": 0.011927537620067596, "learning_rate": 1e-06, "loss": 0.0245, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.017578125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14312.0, "completions/mean_length": 908.201171875, "completions/mean_terminated_length": 631.2981567382812, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3008659780025482, "epoch": 0.37105263157894736, "frac_reward_zero_std": 0.03125, "grad_norm": 0.012268126010894775, "learning_rate": 1e-06, "loss": 0.0135, "num_tokens": 90879481.0, "reward": 0.31674230098724365, "reward_std": 0.2281985729932785, "rewards/progression_diversity/mean": -0.0005779473576694727, "rewards/progression_diversity/std": 0.0075342655181884766, "rewards/symbolic_reward_accuracy/mean": 0.240234375, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.5831705927848816, "rewards/symbolic_reward_partial_score/std": 0.28232038021087646, "rewards/tag_count_reward/mean": -0.0234375, "rewards/tag_count_reward/std": 0.15143637359142303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0659434795379639, "sampling/importance_sampling_ratio/min": 0.00015740895469207317, "sampling/sampling_logp_difference/max": 8.75666332244873, "sampling/sampling_logp_difference/mean": 0.12366581708192825, "step": 141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.2782522588968277, "epoch": 0.3736842105263158, "grad_norm": 0.01116311363875866, "learning_rate": 1e-06, "loss": 0.0528, "step": 142 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3003556579351425, "epoch": 0.3763157894736842, "grad_norm": 0.010078194551169872, "learning_rate": 1e-06, "loss": 0.0121, "step": 143 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.28932641446590424, "epoch": 0.37894736842105264, "grad_norm": 0.007731087971478701, "learning_rate": 1e-06, "loss": 0.0039, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02734375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1726.0, "completions/mean_length": 1018.0078125, "completions/mean_terminated_length": 586.0321044921875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "entropy": 0.2875199466943741, "epoch": 0.3815789473684211, "frac_reward_zero_std": 0.0, "grad_norm": 0.015733782202005386, "learning_rate": 1e-06, "loss": 0.0633, "num_tokens": 91793821.0, "reward": 0.33732670545578003, "reward_std": 0.2401069700717926, "rewards/progression_diversity/mean": -0.0026801489293575287, "rewards/progression_diversity/std": 0.03682376444339752, "rewards/symbolic_reward_accuracy/mean": 0.2734375, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.58935546875, "rewards/symbolic_reward_partial_score/std": 0.29994940757751465, "rewards/tag_count_reward/mean": -0.03515625, "rewards/tag_count_reward/std": 0.1843547374010086, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0609219074249268, "sampling/importance_sampling_ratio/min": 0.0008631742675788701, "sampling/sampling_logp_difference/max": 7.054893970489502, "sampling/sampling_logp_difference/mean": 0.11308446526527405, "step": 145 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.28733527660369873, "epoch": 0.38421052631578945, "grad_norm": 0.012077763676643372, "learning_rate": 1e-06, "loss": 0.0592, "step": 146 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.29585976898670197, "epoch": 0.3868421052631579, "grad_norm": 0.013086505234241486, "learning_rate": 1e-06, "loss": 0.0457, "step": 147 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3003746271133423, "epoch": 0.3894736842105263, "grad_norm": 0.010912450030446053, "learning_rate": 1e-06, "loss": 0.0111, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 811.55078125, "completions/mean_terminated_length": 595.695068359375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "entropy": 0.3026224821805954, "epoch": 0.39210526315789473, "frac_reward_zero_std": 0.0, "grad_norm": 0.012824556790292263, "learning_rate": 1e-06, "loss": 0.0151, "num_tokens": 92619159.0, "reward": 0.3143312335014343, "reward_std": 0.18259009718894958, "rewards/progression_diversity/mean": -0.0024227979592978954, "rewards/progression_diversity/std": 0.03060256503522396, "rewards/symbolic_reward_accuracy/mean": 0.232421875, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.5882161259651184, "rewards/symbolic_reward_partial_score/std": 0.2912585139274597, "rewards/tag_count_reward/mean": -0.015625, "rewards/tag_count_reward/std": 0.12414088100194931, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069498062133789, "sampling/importance_sampling_ratio/min": 0.0012500947341322899, "sampling/sampling_logp_difference/max": 6.684535980224609, "sampling/sampling_logp_difference/mean": 0.1314665824174881, "step": 149 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.2950987070798874, "epoch": 0.39473684210526316, "grad_norm": 0.010317061096429825, "learning_rate": 1e-06, "loss": 0.0318, "step": 150 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2954738140106201, "epoch": 0.3973684210526316, "grad_norm": 0.014257904142141342, "learning_rate": 1e-06, "loss": 0.0424, "step": 151 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3058134615421295, "epoch": 0.4, "grad_norm": 0.00726959016174078, "learning_rate": 1e-06, "loss": 0.0087, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.013671875, "completions/max_length": 16384.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 805.603515625, "completions/mean_terminated_length": 589.6653442382812, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "entropy": 0.30552275478839874, "epoch": 0.4026315789473684, "frac_reward_zero_std": 0.0, "grad_norm": 0.03944050893187523, "learning_rate": 1e-06, "loss": 0.0249, "num_tokens": 93433612.0, "reward": 0.3065668046474457, "reward_std": 0.22543519735336304, "rewards/progression_diversity/mean": -0.0025011475663632154, "rewards/progression_diversity/std": 0.029947273433208466, "rewards/symbolic_reward_accuracy/mean": 0.220703125, "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, "rewards/symbolic_reward_partial_score/mean": 0.5870768427848816, "rewards/symbolic_reward_partial_score/std": 0.2680220305919647, "rewards/tag_count_reward/mean": -0.01953125, "rewards/tag_count_reward/std": 0.1385180652141571, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.070122241973877, "sampling/importance_sampling_ratio/min": 3.465719419182278e-05, "sampling/sampling_logp_difference/max": 10.270005226135254, "sampling/sampling_logp_difference/mean": 0.1315053105354309, "step": 153 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3053535670042038, "epoch": 0.4052631578947368, "grad_norm": 0.011440174654126167, "learning_rate": 1e-06, "loss": 0.0116, "step": 154 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3068731129169464, "epoch": 0.40789473684210525, "grad_norm": 0.009822564199566841, "learning_rate": 1e-06, "loss": 0.0338, "step": 155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2979227900505066, "epoch": 0.4105263157894737, "grad_norm": 0.01283957902342081, "learning_rate": 1e-06, "loss": 0.0337, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 763.703125, "completions/mean_terminated_length": 578.4822387695312, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.31284308433532715, "epoch": 0.4131578947368421, "frac_reward_zero_std": 0.0, "grad_norm": 0.015819117426872253, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 94233588.0, "reward": 0.3703927993774414, "reward_std": 0.24595896899700165, "rewards/progression_diversity/mean": -0.0017375880852341652, "rewards/progression_diversity/std": 0.024503301829099655, "rewards/symbolic_reward_accuracy/mean": 0.30859375, "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, "rewards/symbolic_reward_partial_score/mean": 0.6233723759651184, "rewards/symbolic_reward_partial_score/std": 0.3047916889190674, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0701302289962769, "sampling/importance_sampling_ratio/min": 8.33201596606159e-08, "sampling/sampling_logp_difference/max": 16.300575256347656, "sampling/sampling_logp_difference/mean": 0.13046219944953918, "step": 157 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.29947347939014435, "epoch": 0.41578947368421054, "grad_norm": 0.012457345612347126, "learning_rate": 1e-06, "loss": 0.0385, "step": 158 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2942707985639572, "epoch": 0.41842105263157897, "grad_norm": 0.007380845490843058, "learning_rate": 1e-06, "loss": 0.0188, "step": 159 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.2961069345474243, "epoch": 0.42105263157894735, "grad_norm": 0.009989511221647263, "learning_rate": 1e-06, "loss": 0.0147, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1410.0, "completions/mean_length": 733.279296875, "completions/mean_terminated_length": 578.9329223632812, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "entropy": 0.29683390259742737, "epoch": 0.4236842105263158, "frac_reward_zero_std": 0.0, "grad_norm": 0.012312391772866249, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 95011011.0, "reward": 0.3097507655620575, "reward_std": 0.21708126366138458, "rewards/progression_diversity/mean": -0.0014869628939777613, "rewards/progression_diversity/std": 0.020555421710014343, "rewards/symbolic_reward_accuracy/mean": 0.224609375, "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, "rewards/symbolic_reward_partial_score/mean": 0.5891926884651184, "rewards/symbolic_reward_partial_score/std": 0.28653624653816223, "rewards/tag_count_reward/mean": -0.017578125, "rewards/tag_count_reward/std": 0.13154059648513794, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0722882747650146, "sampling/importance_sampling_ratio/min": 0.0011683765333145857, "sampling/sampling_logp_difference/max": 6.752140045166016, "sampling/sampling_logp_difference/mean": 0.1345098316669464, "step": 161 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4921875, "entropy": 0.29501038789749146, "epoch": 0.4263157894736842, "grad_norm": 0.011208836920559406, "learning_rate": 1e-06, "loss": 0.0448, "step": 162 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.31285424530506134, "epoch": 0.42894736842105263, "grad_norm": 0.01194053515791893, "learning_rate": 1e-06, "loss": 0.0047, "step": 163 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.2996787875890732, "epoch": 0.43157894736842106, "grad_norm": 0.0087357759475708, "learning_rate": 1e-06, "loss": 0.0212, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12065.0, "completions/mean_length": 683.697265625, "completions/mean_terminated_length": 591.1611328125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "entropy": 0.31017276644706726, "epoch": 0.4342105263157895, "frac_reward_zero_std": 0.0, "grad_norm": 0.014013568870723248, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 95761384.0, "reward": 0.3427630066871643, "reward_std": 0.21610036492347717, "rewards/progression_diversity/mean": -0.0010415659053251147, "rewards/progression_diversity/std": 0.01706579513847828, "rewards/symbolic_reward_accuracy/mean": 0.263671875, "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, "rewards/symbolic_reward_partial_score/mean": 0.6165364980697632, "rewards/symbolic_reward_partial_score/std": 0.2920151948928833, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0745972394943237, "sampling/importance_sampling_ratio/min": 0.000583315675612539, "sampling/sampling_logp_difference/max": 7.446782112121582, "sampling/sampling_logp_difference/mean": 0.1369236409664154, "step": 165 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.29873988032341003, "epoch": 0.4368421052631579, "grad_norm": 0.012282563373446465, "learning_rate": 1e-06, "loss": 0.0157, "step": 166 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.296875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.3020479381084442, "epoch": 0.4394736842105263, "grad_norm": 0.00869888998568058, "learning_rate": 1e-06, "loss": 0.018, "step": 167 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3060583770275116, "epoch": 0.4421052631578947, "grad_norm": 0.008922635577619076, "learning_rate": 1e-06, "loss": 0.0002, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 705.625, "completions/mean_terminated_length": 551.0059204101562, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3060384541749954, "epoch": 0.44473684210526315, "frac_reward_zero_std": 0.0, "grad_norm": 0.018640100955963135, "learning_rate": 1e-06, "loss": 0.005, "num_tokens": 96506920.0, "reward": 0.44208988547325134, "reward_std": 0.26640045642852783, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.3984375, "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, "rewards/symbolic_reward_partial_score/mean": 0.6800130009651184, "rewards/symbolic_reward_partial_score/std": 0.3060167729854584, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0716135501861572, "sampling/importance_sampling_ratio/min": 0.0021636190358549356, "sampling/sampling_logp_difference/max": 6.13597297668457, "sampling/sampling_logp_difference/mean": 0.1341308355331421, "step": 169 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.29913921654224396, "epoch": 0.4473684210526316, "grad_norm": 0.007306138519197702, "learning_rate": 1e-06, "loss": 0.0337, "step": 170 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.3203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.30154837667942047, "epoch": 0.45, "grad_norm": 0.016574641689658165, "learning_rate": 1e-06, "loss": 0.0075, "step": 171 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.30604830384254456, "epoch": 0.45263157894736844, "grad_norm": 0.013307604938745499, "learning_rate": 1e-06, "loss": -0.0038, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 626.103515625, "completions/mean_terminated_length": 533.2279052734375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.30806519091129303, "epoch": 0.45526315789473687, "frac_reward_zero_std": 0.0, "grad_norm": 0.015151865780353546, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 97206429.0, "reward": 0.41551151871681213, "reward_std": 0.19223448634147644, "rewards/progression_diversity/mean": -0.0015838092658668756, "rewards/progression_diversity/std": 0.0340169332921505, "rewards/symbolic_reward_accuracy/mean": 0.35546875, "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, "rewards/symbolic_reward_partial_score/mean": 0.6767578125, "rewards/symbolic_reward_partial_score/std": 0.2906571328639984, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0748977661132812, "sampling/importance_sampling_ratio/min": 0.00040466117206960917, "sampling/sampling_logp_difference/max": 7.812460422515869, "sampling/sampling_logp_difference/mean": 0.1398119330406189, "step": 173 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3104834258556366, "epoch": 0.45789473684210524, "grad_norm": 0.01459193229675293, "learning_rate": 1e-06, "loss": 0.0219, "step": 174 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.30411429703235626, "epoch": 0.4605263157894737, "grad_norm": 0.010530464351177216, "learning_rate": 1e-06, "loss": 0.0014, "step": 175 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3101528435945511, "epoch": 0.4631578947368421, "grad_norm": 0.01191193237900734, "learning_rate": 1e-06, "loss": 0.014, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009765625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1359.0, "completions/mean_length": 706.248046875, "completions/mean_terminated_length": 551.6351318359375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3114381581544876, "epoch": 0.46578947368421053, "frac_reward_zero_std": 0.0, "grad_norm": 0.016358286142349243, "learning_rate": 1e-06, "loss": -0.0037, "num_tokens": 97972860.0, "reward": 0.3487420082092285, "reward_std": 0.22794780135154724, "rewards/progression_diversity/mean": -0.003728086594492197, "rewards/progression_diversity/std": 0.046275150030851364, "rewards/symbolic_reward_accuracy/mean": 0.271484375, "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, "rewards/symbolic_reward_partial_score/mean": 0.6222330331802368, "rewards/symbolic_reward_partial_score/std": 0.2960527837276459, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0715268850326538, "sampling/importance_sampling_ratio/min": 2.043495624093339e-05, "sampling/sampling_logp_difference/max": 10.798263549804688, "sampling/sampling_logp_difference/mean": 0.1340111345052719, "step": 177 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.29386352002620697, "epoch": 0.46842105263157896, "grad_norm": 0.013762143440544605, "learning_rate": 1e-06, "loss": 0.0507, "step": 178 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.3153984248638153, "epoch": 0.4710526315789474, "grad_norm": 0.007509378716349602, "learning_rate": 1e-06, "loss": 0.0019, "step": 179 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3007982224225998, "epoch": 0.47368421052631576, "grad_norm": 0.012465434148907661, "learning_rate": 1e-06, "loss": 0.0113, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 710.966796875, "completions/mean_terminated_length": 525.1205444335938, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "entropy": 0.31515413522720337, "epoch": 0.4763157894736842, "frac_reward_zero_std": 0.03125, "grad_norm": 0.014978190883994102, "learning_rate": 1e-06, "loss": 0.0238, "num_tokens": 98750187.0, "reward": 0.40842536091804504, "reward_std": 0.22665637731552124, "rewards/progression_diversity/mean": -0.002191566862165928, "rewards/progression_diversity/std": 0.033938221633434296, "rewards/symbolic_reward_accuracy/mean": 0.35546875, "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, "rewards/symbolic_reward_partial_score/mean": 0.6531575918197632, "rewards/symbolic_reward_partial_score/std": 0.31162047386169434, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0704665184020996, "sampling/importance_sampling_ratio/min": 0.000455824047094211, "sampling/sampling_logp_difference/max": 7.693403720855713, "sampling/sampling_logp_difference/mean": 0.13239729404449463, "step": 181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30028870701789856, "epoch": 0.4789473684210526, "grad_norm": 0.008311889134347439, "learning_rate": 1e-06, "loss": 0.0028, "step": 182 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3118121027946472, "epoch": 0.48157894736842105, "grad_norm": 0.009793510660529137, "learning_rate": 1e-06, "loss": 0.0052, "step": 183 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3031664490699768, "epoch": 0.4842105263157895, "grad_norm": 0.010256760753691196, "learning_rate": 1e-06, "loss": 0.0198, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 563.029296875, "completions/mean_terminated_length": 532.0684814453125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.3078819364309311, "epoch": 0.4868421052631579, "frac_reward_zero_std": 0.0, "grad_norm": 0.01614123024046421, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 99453114.0, "reward": 0.39682406187057495, "reward_std": 0.22970899939537048, "rewards/progression_diversity/mean": -0.00021465322060976177, "rewards/progression_diversity/std": 0.00485704792663455, "rewards/symbolic_reward_accuracy/mean": 0.330078125, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.66259765625, "rewards/symbolic_reward_partial_score/std": 0.28932827711105347, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078159213066101, "sampling/importance_sampling_ratio/min": 5.380706716096029e-05, "sampling/sampling_logp_difference/max": 9.830105781555176, "sampling/sampling_logp_difference/mean": 0.14422446489334106, "step": 185 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3147430270910263, "epoch": 0.48947368421052634, "grad_norm": 0.011492928490042686, "learning_rate": 1e-06, "loss": 0.001, "step": 186 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.28125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5078125, "entropy": 0.3014184534549713, "epoch": 0.4921052631578947, "grad_norm": 0.009395534172654152, "learning_rate": 1e-06, "loss": 0.0091, "step": 187 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.30802275240421295, "epoch": 0.49473684210526314, "grad_norm": 0.010077573359012604, "learning_rate": 1e-06, "loss": 0.0012, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 525.791015625, "completions/mean_terminated_length": 525.791015625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.30758512020111084, "epoch": 0.49736842105263157, "frac_reward_zero_std": 0.0, "grad_norm": 0.014145960099995136, "learning_rate": 1e-06, "loss": -0.008, "num_tokens": 100124943.0, "reward": 0.44975587725639343, "reward_std": 0.23507720232009888, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.396484375, "rewards/symbolic_reward_accuracy/std": 0.4896455705165863, "rewards/symbolic_reward_partial_score/mean": 0.7062174677848816, "rewards/symbolic_reward_partial_score/std": 0.28074365854263306, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0785939693450928, "sampling/importance_sampling_ratio/min": 6.271932943491265e-05, "sampling/sampling_logp_difference/max": 9.676840782165527, "sampling/sampling_logp_difference/mean": 0.1481301635503769, "step": 189 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.5, "entropy": 0.3067742586135864, "epoch": 0.5, "grad_norm": 0.00825838465243578, "learning_rate": 1e-06, "loss": 0.0034, "step": 190 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.31375977396965027, "epoch": 0.5026315789473684, "grad_norm": 0.014494653791189194, "learning_rate": 1e-06, "loss": 0.0033, "step": 191 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3070768415927887, "epoch": 0.5052631578947369, "grad_norm": 0.00879518873989582, "learning_rate": 1e-06, "loss": 0.0032, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1541.0, "completions/mean_length": 595.572265625, "completions/mean_terminated_length": 533.6569213867188, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.3034219741821289, "epoch": 0.5078947368421053, "frac_reward_zero_std": 0.03125, "grad_norm": 0.013428542762994766, "learning_rate": 1e-06, "loss": 0.0122, "num_tokens": 100829172.0, "reward": 0.4348069429397583, "reward_std": 0.23271231353282928, "rewards/progression_diversity/mean": -0.000752636871766299, "rewards/progression_diversity/std": 0.017030227929353714, "rewards/symbolic_reward_accuracy/mean": 0.380859375, "rewards/symbolic_reward_accuracy/std": 0.48607301712036133, "rewards/symbolic_reward_partial_score/mean": 0.6896159052848816, "rewards/symbolic_reward_partial_score/std": 0.3018878102302551, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0762717723846436, "sampling/importance_sampling_ratio/min": 1.2637668987736106e-05, "sampling/sampling_logp_difference/max": 11.278828620910645, "sampling/sampling_logp_difference/mean": 0.1418013572692871, "step": 193 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3115580826997757, "epoch": 0.5105263157894737, "grad_norm": 0.010667307302355766, "learning_rate": 1e-06, "loss": -0.0013, "step": 194 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.307052806019783, "epoch": 0.5131578947368421, "grad_norm": 0.013530107215046883, "learning_rate": 1e-06, "loss": -0.0067, "step": 195 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4765625, "entropy": 0.3012283742427826, "epoch": 0.5157894736842106, "grad_norm": 0.01216200552880764, "learning_rate": 1e-06, "loss": 0.0267, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 580.47265625, "completions/mean_terminated_length": 518.498046875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.3057010918855667, "epoch": 0.5184210526315789, "frac_reward_zero_std": 0.0, "grad_norm": 0.014998500235378742, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 101530182.0, "reward": 0.415574848651886, "reward_std": 0.25428831577301025, "rewards/progression_diversity/mean": -0.00013565561675932258, "rewards/progression_diversity/std": 0.003069536527618766, "rewards/symbolic_reward_accuracy/mean": 0.3515625, "rewards/symbolic_reward_accuracy/std": 0.4779251217842102, "rewards/symbolic_reward_partial_score/mean": 0.68408203125, "rewards/symbolic_reward_partial_score/std": 0.2947002053260803, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0763626098632812, "sampling/importance_sampling_ratio/min": 0.0009694536565802991, "sampling/sampling_logp_difference/max": 6.938777923583984, "sampling/sampling_logp_difference/mean": 0.14132827520370483, "step": 197 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.515625, "entropy": 0.30432167649269104, "epoch": 0.5210526315789473, "grad_norm": 0.009974686428904533, "learning_rate": 1e-06, "loss": -0.0018, "step": 198 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3102835714817047, "epoch": 0.5236842105263158, "grad_norm": 0.013394089415669441, "learning_rate": 1e-06, "loss": 0.0013, "step": 199 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3013733774423599, "epoch": 0.5263157894736842, "grad_norm": 0.007236282341182232, "learning_rate": 1e-06, "loss": 0.0109, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15202.0, "completions/mean_length": 670.62109375, "completions/mean_terminated_length": 546.8936767578125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.295871302485466, "epoch": 0.5289473684210526, "frac_reward_zero_std": 0.03125, "grad_norm": 0.013714026659727097, "learning_rate": 1e-06, "loss": 0.053, "num_tokens": 102291044.0, "reward": 0.40374255180358887, "reward_std": 0.2347850203514099, "rewards/progression_diversity/mean": -0.0017245247727259994, "rewards/progression_diversity/std": 0.027910111472010612, "rewards/symbolic_reward_accuracy/mean": 0.337890625, "rewards/symbolic_reward_accuracy/std": 0.4734536409378052, "rewards/symbolic_reward_partial_score/mean": 0.6739909052848816, "rewards/symbolic_reward_partial_score/std": 0.28590893745422363, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0722577571868896, "sampling/importance_sampling_ratio/min": 0.0002544128510635346, "sampling/sampling_logp_difference/max": 8.276552200317383, "sampling/sampling_logp_difference/mean": 0.13411201536655426, "step": 201 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.31462037563323975, "epoch": 0.531578947368421, "grad_norm": 0.011020858772099018, "learning_rate": 1e-06, "loss": -0.0013, "step": 202 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3070591688156128, "epoch": 0.5342105263157895, "grad_norm": 0.008980306796729565, "learning_rate": 1e-06, "loss": 0.0029, "step": 203 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.30502383410930634, "epoch": 0.5368421052631579, "grad_norm": 0.012147264555096626, "learning_rate": 1e-06, "loss": 0.0231, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 577.3671875, "completions/mean_terminated_length": 515.3804321289062, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.30741482973098755, "epoch": 0.5394736842105263, "frac_reward_zero_std": 0.0625, "grad_norm": 0.010803978890180588, "learning_rate": 1e-06, "loss": 0.0152, "num_tokens": 102999648.0, "reward": 0.4042305052280426, "reward_std": 0.18462583422660828, "rewards/progression_diversity/mean": -0.0017548013711348176, "rewards/progression_diversity/std": 0.034413598477840424, "rewards/symbolic_reward_accuracy/mean": 0.33203125, "rewards/symbolic_reward_accuracy/std": 0.47140273451805115, "rewards/symbolic_reward_partial_score/mean": 0.6847330927848816, "rewards/symbolic_reward_partial_score/std": 0.27975600957870483, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077283263206482, "sampling/importance_sampling_ratio/min": 0.0026723169721663, "sampling/sampling_logp_difference/max": 5.924809455871582, "sampling/sampling_logp_difference/mean": 0.14251382648944855, "step": 205 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3081301152706146, "epoch": 0.5421052631578948, "grad_norm": 0.007183319889008999, "learning_rate": 1e-06, "loss": -0.0016, "step": 206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3076689690351486, "epoch": 0.5447368421052632, "grad_norm": 0.01192345842719078, "learning_rate": 1e-06, "loss": -0.0043, "step": 207 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3079235255718231, "epoch": 0.5473684210526316, "grad_norm": 0.007967538200318813, "learning_rate": 1e-06, "loss": 0.0191, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 527.55859375, "completions/mean_terminated_length": 496.52838134765625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "entropy": 0.316678062081337, "epoch": 0.55, "frac_reward_zero_std": 0.03125, "grad_norm": 0.012876125983893871, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 103652030.0, "reward": 0.5482417941093445, "reward_std": 0.24300527572631836, "rewards/progression_diversity/mean": -4.2574582039378583e-05, "rewards/progression_diversity/std": 0.0009633527952246368, "rewards/symbolic_reward_accuracy/mean": 0.52734375, "rewards/symbolic_reward_accuracy/std": 0.49974003434181213, "rewards/symbolic_reward_partial_score/mean": 0.7734375, "rewards/symbolic_reward_partial_score/std": 0.28300556540489197, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078352689743042, "sampling/importance_sampling_ratio/min": 0.000436106463894248, "sampling/sampling_logp_difference/max": 7.737624168395996, "sampling/sampling_logp_difference/mean": 0.1448683738708496, "step": 209 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.31302228569984436, "epoch": 0.5526315789473685, "grad_norm": 0.011758224107325077, "learning_rate": 1e-06, "loss": 0.0025, "step": 210 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.30750422179698944, "epoch": 0.5552631578947368, "grad_norm": 0.007878407835960388, "learning_rate": 1e-06, "loss": 0.0009, "step": 211 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.2995268553495407, "epoch": 0.5578947368421052, "grad_norm": 0.010879729874432087, "learning_rate": 1e-06, "loss": 0.0096, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 612.904296875, "completions/mean_terminated_length": 519.950927734375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "entropy": 0.2970995306968689, "epoch": 0.5605263157894737, "frac_reward_zero_std": 0.0, "grad_norm": 0.013787680305540562, "learning_rate": 1e-06, "loss": 0.0091, "num_tokens": 104361645.0, "reward": 0.44565433263778687, "reward_std": 0.21747955679893494, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.388671875, "rewards/symbolic_reward_accuracy/std": 0.4879252314567566, "rewards/symbolic_reward_partial_score/mean": 0.7120768427848816, "rewards/symbolic_reward_partial_score/std": 0.2797081768512726, "rewards/tag_count_reward/mean": -0.01171875, "rewards/tag_count_reward/std": 0.10772226005792618, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0732554197311401, "sampling/importance_sampling_ratio/min": 6.989557732595131e-05, "sampling/sampling_logp_difference/max": 9.56850814819336, "sampling/sampling_logp_difference/mean": 0.13793891668319702, "step": 213 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.31382423639297485, "epoch": 0.5631578947368421, "grad_norm": 0.011133058927953243, "learning_rate": 1e-06, "loss": -0.0007, "step": 214 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.3052579313516617, "epoch": 0.5657894736842105, "grad_norm": 0.01070347335189581, "learning_rate": 1e-06, "loss": 0.0011, "step": 215 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.2916347533464432, "epoch": 0.5684210526315789, "grad_norm": 0.008687763474881649, "learning_rate": 1e-06, "loss": 0.0397, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3699.0, "completions/max_terminated_length": 3699.0, "completions/mean_length": 499.142578125, "completions/mean_terminated_length": 499.142578125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3144657015800476, "epoch": 0.5710526315789474, "frac_reward_zero_std": 0.125, "grad_norm": 0.012763739563524723, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 105019350.0, "reward": 0.44731447100639343, "reward_std": 0.19935737550258636, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.380859375, "rewards/symbolic_reward_accuracy/std": 0.48607301712036133, "rewards/symbolic_reward_partial_score/mean": 0.7293294668197632, "rewards/symbolic_reward_partial_score/std": 0.2697790563106537, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07834792137146, "sampling/importance_sampling_ratio/min": 0.001173157594166696, "sampling/sampling_logp_difference/max": 6.748056411743164, "sampling/sampling_logp_difference/mean": 0.14621981978416443, "step": 217 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3075670599937439, "epoch": 0.5736842105263158, "grad_norm": 0.006303088273853064, "learning_rate": 1e-06, "loss": -0.0005, "step": 218 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.2734375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3094196021556854, "epoch": 0.5763157894736842, "grad_norm": 0.011260257102549076, "learning_rate": 1e-06, "loss": -0.0, "step": 219 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3103173077106476, "epoch": 0.5789473684210527, "grad_norm": 0.010579501278698444, "learning_rate": 1e-06, "loss": 0.0026, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 595.380859375, "completions/mean_terminated_length": 502.3241882324219, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "entropy": 0.30067962408065796, "epoch": 0.5815789473684211, "frac_reward_zero_std": 0.03125, "grad_norm": 0.014474834315478802, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 105736857.0, "reward": 0.38732582330703735, "reward_std": 0.18756034970283508, "rewards/progression_diversity/mean": -0.0027724693063646555, "rewards/progression_diversity/std": 0.04055400565266609, "rewards/symbolic_reward_accuracy/mean": 0.30078125, "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, "rewards/symbolic_reward_partial_score/mean": 0.69091796875, "rewards/symbolic_reward_partial_score/std": 0.26963144540786743, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0751022100448608, "sampling/importance_sampling_ratio/min": 0.00019873835844919086, "sampling/sampling_logp_difference/max": 8.523521423339844, "sampling/sampling_logp_difference/mean": 0.13958622515201569, "step": 221 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.3127920478582382, "epoch": 0.5842105263157895, "grad_norm": 0.010545584373176098, "learning_rate": 1e-06, "loss": 0.006, "step": 222 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3065568804740906, "epoch": 0.5868421052631579, "grad_norm": 0.008446205407381058, "learning_rate": 1e-06, "loss": -0.0092, "step": 223 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.31316129863262177, "epoch": 0.5894736842105263, "grad_norm": 0.006929857190698385, "learning_rate": 1e-06, "loss": 0.0128, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 520.384765625, "completions/mean_terminated_length": 489.34051513671875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "entropy": 0.3118240088224411, "epoch": 0.5921052631578947, "frac_reward_zero_std": 0.03125, "grad_norm": 0.013699988834559917, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 106365054.0, "reward": 0.5598512887954712, "reward_std": 0.22764000296592712, "rewards/progression_diversity/mean": -0.0012044012546539307, "rewards/progression_diversity/std": 0.02724718675017357, "rewards/symbolic_reward_accuracy/mean": 0.5390625, "rewards/symbolic_reward_accuracy/std": 0.4989593029022217, "rewards/symbolic_reward_partial_score/mean": 0.7893880605697632, "rewards/symbolic_reward_partial_score/std": 0.2728375494480133, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0779380798339844, "sampling/importance_sampling_ratio/min": 0.0002490824554115534, "sampling/sampling_logp_difference/max": 8.29772663116455, "sampling/sampling_logp_difference/mean": 0.1458706259727478, "step": 225 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.310558557510376, "epoch": 0.5947368421052631, "grad_norm": 0.008256916888058186, "learning_rate": 1e-06, "loss": 0.0002, "step": 226 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3150336593389511, "epoch": 0.5973684210526315, "grad_norm": 0.007905205711722374, "learning_rate": 1e-06, "loss": 0.0038, "step": 227 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3053455650806427, "epoch": 0.6, "grad_norm": 0.011195844039320946, "learning_rate": 1e-06, "loss": -0.0038, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 490.607421875, "completions/mean_terminated_length": 490.607421875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.3094610720872879, "epoch": 0.6026315789473684, "frac_reward_zero_std": 0.09375, "grad_norm": 0.015274910256266594, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 107001365.0, "reward": 0.535107433795929, "reward_std": 0.21647313237190247, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5, "rewards/symbolic_reward_accuracy/std": 0.5004889965057373, "rewards/symbolic_reward_partial_score/mean": 0.7843424677848816, "rewards/symbolic_reward_partial_score/std": 0.2582632303237915, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0789885520935059, "sampling/importance_sampling_ratio/min": 3.823011411441257e-06, "sampling/sampling_logp_difference/max": 12.474472045898438, "sampling/sampling_logp_difference/mean": 0.14612674713134766, "step": 229 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.310358390212059, "epoch": 0.6052631578947368, "grad_norm": 0.009558123536407948, "learning_rate": 1e-06, "loss": 0.0028, "step": 230 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3124067932367325, "epoch": 0.6078947368421053, "grad_norm": 0.007349732331931591, "learning_rate": 1e-06, "loss": 0.0012, "step": 231 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4375, "entropy": 0.3077686280012131, "epoch": 0.6105263157894737, "grad_norm": 0.010673035867512226, "learning_rate": 1e-06, "loss": -0.0025, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 490.693359375, "completions/mean_terminated_length": 490.693359375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.3138282001018524, "epoch": 0.6131578947368421, "frac_reward_zero_std": 0.0625, "grad_norm": 0.012326201424002647, "learning_rate": 1e-06, "loss": -0.0036, "num_tokens": 107653080.0, "reward": 0.42856448888778687, "reward_std": 0.21210803091526031, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.357421875, "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, "rewards/symbolic_reward_partial_score/mean": 0.71435546875, "rewards/symbolic_reward_partial_score/std": 0.26391804218292236, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.079890489578247, "sampling/importance_sampling_ratio/min": 7.049969281069934e-05, "sampling/sampling_logp_difference/max": 9.55990219116211, "sampling/sampling_logp_difference/mean": 0.14701983332633972, "step": 233 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30451372265815735, "epoch": 0.6157894736842106, "grad_norm": 0.009723720140755177, "learning_rate": 1e-06, "loss": 0.0015, "step": 234 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.30988383293151855, "epoch": 0.618421052631579, "grad_norm": 0.008871919475495815, "learning_rate": 1e-06, "loss": 0.0007, "step": 235 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.25, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.3084511011838913, "epoch": 0.6210526315789474, "grad_norm": 0.006157966796308756, "learning_rate": 1e-06, "loss": 0.0038, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1348.0, "completions/mean_length": 527.09375, "completions/mean_terminated_length": 496.0626220703125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.3075174540281296, "epoch": 0.6236842105263158, "frac_reward_zero_std": 0.0625, "grad_norm": 0.014929900877177715, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 108329928.0, "reward": 0.47631800174713135, "reward_std": 0.18285810947418213, "rewards/progression_diversity/mean": -3.653179373941384e-05, "rewards/progression_diversity/std": 0.0008266201475635171, "rewards/symbolic_reward_accuracy/mean": 0.416015625, "rewards/symbolic_reward_accuracy/std": 0.493378221988678, "rewards/symbolic_reward_partial_score/mean": 0.75634765625, "rewards/symbolic_reward_partial_score/std": 0.2513831555843353, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0761387348175049, "sampling/importance_sampling_ratio/min": 0.00018604067736305296, "sampling/sampling_logp_difference/max": 8.589545249938965, "sampling/sampling_logp_difference/mean": 0.14305877685546875, "step": 237 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.30407415330410004, "epoch": 0.6263157894736842, "grad_norm": 0.010024651885032654, "learning_rate": 1e-06, "loss": 0.0205, "step": 238 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.29691192507743835, "epoch": 0.6289473684210526, "grad_norm": 0.008623898029327393, "learning_rate": 1e-06, "loss": 0.0025, "step": 239 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3079955577850342, "epoch": 0.631578947368421, "grad_norm": 0.012140064500272274, "learning_rate": 1e-06, "loss": 0.0027, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12821.0, "completions/mean_length": 544.341796875, "completions/mean_terminated_length": 513.3444213867188, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3042522668838501, "epoch": 0.6342105263157894, "frac_reward_zero_std": 0.09375, "grad_norm": 0.014113095588982105, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 108989399.0, "reward": 0.5777234435081482, "reward_std": 0.18902507424354553, "rewards/progression_diversity/mean": -0.0010965826222673059, "rewards/progression_diversity/std": 0.024812830612063408, "rewards/symbolic_reward_accuracy/mean": 0.560546875, "rewards/symbolic_reward_accuracy/std": 0.49680593609809875, "rewards/symbolic_reward_partial_score/mean": 0.8059896230697632, "rewards/symbolic_reward_partial_score/std": 0.2615269124507904, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0745680332183838, "sampling/importance_sampling_ratio/min": 4.572784291667631e-06, "sampling/sampling_logp_difference/max": 12.295388221740723, "sampling/sampling_logp_difference/mean": 0.14019736647605896, "step": 241 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3048563450574875, "epoch": 0.6368421052631579, "grad_norm": 0.006459313910454512, "learning_rate": 1e-06, "loss": 0.0008, "step": 242 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.30121709406375885, "epoch": 0.6394736842105263, "grad_norm": 0.016183165833353996, "learning_rate": 1e-06, "loss": 0.0208, "step": 243 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.328125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.29656779766082764, "epoch": 0.6421052631578947, "grad_norm": 0.010587197728455067, "learning_rate": 1e-06, "loss": 0.0169, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 13638.0, "completions/max_terminated_length": 13638.0, "completions/mean_length": 520.732421875, "completions/mean_terminated_length": 520.732421875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.3021259307861328, "epoch": 0.6447368421052632, "frac_reward_zero_std": 0.09375, "grad_norm": 0.013214373961091042, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 109673678.0, "reward": 0.42620930075645447, "reward_std": 0.1578913778066635, "rewards/progression_diversity/mean": -0.0011426578275859356, "rewards/progression_diversity/std": 0.02516518160700798, "rewards/symbolic_reward_accuracy/mean": 0.341796875, "rewards/symbolic_reward_accuracy/std": 0.4747757613658905, "rewards/symbolic_reward_partial_score/mean": 0.73779296875, "rewards/symbolic_reward_partial_score/std": 0.23799775540828705, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075998306274414, "sampling/importance_sampling_ratio/min": 0.0007776974816806614, "sampling/sampling_logp_difference/max": 7.159173011779785, "sampling/sampling_logp_difference/mean": 0.14280164241790771, "step": 245 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3004751205444336, "epoch": 0.6473684210526316, "grad_norm": 0.00852108933031559, "learning_rate": 1e-06, "loss": 0.0024, "step": 246 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3045208901166916, "epoch": 0.65, "grad_norm": 0.01242540031671524, "learning_rate": 1e-06, "loss": 0.0037, "step": 247 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.2974863350391388, "epoch": 0.6526315789473685, "grad_norm": 0.011423774063587189, "learning_rate": 1e-06, "loss": 0.008, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 529.646484375, "completions/mean_terminated_length": 498.620361328125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3006265312433243, "epoch": 0.6552631578947369, "frac_reward_zero_std": 0.09375, "grad_norm": 0.013644593767821789, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 110364185.0, "reward": 0.4431152641773224, "reward_std": 0.1808246374130249, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.365234375, "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, "rewards/symbolic_reward_partial_score/mean": 0.7472330927848816, "rewards/symbolic_reward_partial_score/std": 0.22773893177509308, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0767557621002197, "sampling/importance_sampling_ratio/min": 0.0005200820742174983, "sampling/sampling_logp_difference/max": 7.561523914337158, "sampling/sampling_logp_difference/mean": 0.1415756642818451, "step": 249 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.30361469089984894, "epoch": 0.6578947368421053, "grad_norm": 0.007451191544532776, "learning_rate": 1e-06, "loss": 0.0067, "step": 250 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.30292457342147827, "epoch": 0.6605263157894737, "grad_norm": 0.011732818558812141, "learning_rate": 1e-06, "loss": -0.002, "step": 251 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.30056828260421753, "epoch": 0.6631578947368421, "grad_norm": 0.006881546229124069, "learning_rate": 1e-06, "loss": 0.0062, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 606.2890625, "completions/mean_terminated_length": 513.2966918945312, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.29490475356578827, "epoch": 0.6657894736842105, "frac_reward_zero_std": 0.09375, "grad_norm": 0.017659462988376617, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 111082765.0, "reward": 0.4332265555858612, "reward_std": 0.18931256234645844, "rewards/progression_diversity/mean": -0.0025423369370400906, "rewards/progression_diversity/std": 0.03626888990402222, "rewards/symbolic_reward_accuracy/mean": 0.357421875, "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, "rewards/symbolic_reward_partial_score/mean": 0.7325845956802368, "rewards/symbolic_reward_partial_score/std": 0.2550489902496338, "rewards/tag_count_reward/mean": -0.009765625, "rewards/tag_count_reward/std": 0.09843364357948303, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0724674463272095, "sampling/importance_sampling_ratio/min": 6.315225618891418e-05, "sampling/sampling_logp_difference/max": 9.669961929321289, "sampling/sampling_logp_difference/mean": 0.13574518263339996, "step": 253 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.2578125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.292779803276062, "epoch": 0.6684210526315789, "grad_norm": 0.008213960565626621, "learning_rate": 1e-06, "loss": 0.0173, "step": 254 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.29811522364616394, "epoch": 0.6710526315789473, "grad_norm": 0.01159645989537239, "learning_rate": 1e-06, "loss": 0.0013, "step": 255 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.3002711534500122, "epoch": 0.6736842105263158, "grad_norm": 0.01329131331294775, "learning_rate": 1e-06, "loss": 0.0049, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1511.0, "completions/mean_length": 602.314453125, "completions/mean_terminated_length": 509.29864501953125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2865429222583771, "epoch": 0.6763157894736842, "frac_reward_zero_std": 0.0625, "grad_norm": 0.012785503640770912, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 111793454.0, "reward": 0.5003780126571655, "reward_std": 0.21868066489696503, "rewards/progression_diversity/mean": -0.0012658978812396526, "rewards/progression_diversity/std": 0.02685299515724182, "rewards/symbolic_reward_accuracy/mean": 0.453125, "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, "rewards/symbolic_reward_partial_score/mean": 0.7630208730697632, "rewards/symbolic_reward_partial_score/std": 0.26299402117729187, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0711028575897217, "sampling/importance_sampling_ratio/min": 0.00016783220053184777, "sampling/sampling_logp_difference/max": 8.692545890808105, "sampling/sampling_logp_difference/mean": 0.13469204306602478, "step": 257 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2916171997785568, "epoch": 0.6789473684210526, "grad_norm": 0.00957430712878704, "learning_rate": 1e-06, "loss": 0.0051, "step": 258 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.29992710053920746, "epoch": 0.6815789473684211, "grad_norm": 0.009682898409664631, "learning_rate": 1e-06, "loss": -0.0027, "step": 259 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.3045227527618408, "epoch": 0.6842105263157895, "grad_norm": 0.010915652848780155, "learning_rate": 1e-06, "loss": -0.0018, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 499.568359375, "completions/mean_terminated_length": 499.568359375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.29767170548439026, "epoch": 0.6868421052631579, "frac_reward_zero_std": 0.09375, "grad_norm": 0.013380862772464752, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 112431025.0, "reward": 0.503369152545929, "reward_std": 0.20027483999729156, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.455078125, "rewards/symbolic_reward_accuracy/std": 0.4984649419784546, "rewards/symbolic_reward_partial_score/mean": 0.7683919072151184, "rewards/symbolic_reward_partial_score/std": 0.2547670304775238, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077869176864624, "sampling/importance_sampling_ratio/min": 8.587339107180014e-05, "sampling/sampling_logp_difference/max": 9.36263656616211, "sampling/sampling_logp_difference/mean": 0.14571282267570496, "step": 261 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.30793944001197815, "epoch": 0.6894736842105263, "grad_norm": 0.009117713198065758, "learning_rate": 1e-06, "loss": -0.0024, "step": 262 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3046875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3103470057249069, "epoch": 0.6921052631578948, "grad_norm": 0.008808186277747154, "learning_rate": 1e-06, "loss": -0.002, "step": 263 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.29520297050476074, "epoch": 0.6947368421052632, "grad_norm": 0.012514298781752586, "learning_rate": 1e-06, "loss": 0.0058, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 547.052734375, "completions/mean_terminated_length": 484.94708251953125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.29729560017585754, "epoch": 0.6973684210526315, "frac_reward_zero_std": 0.15625, "grad_norm": 0.01363955345004797, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 113121260.0, "reward": 0.5068749189376831, "reward_std": 0.15882600843906403, "rewards/progression_diversity/mean": -0.000983412959612906, "rewards/progression_diversity/std": 0.021069984883069992, "rewards/symbolic_reward_accuracy/mean": 0.447265625, "rewards/symbolic_reward_accuracy/std": 0.4976975917816162, "rewards/symbolic_reward_partial_score/mean": 0.7957357168197632, "rewards/symbolic_reward_partial_score/std": 0.22917680442333221, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0737191438674927, "sampling/importance_sampling_ratio/min": 0.0028329859487712383, "sampling/sampling_logp_difference/max": 5.866424083709717, "sampling/sampling_logp_difference/mean": 0.1374402940273285, "step": 265 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3016761392354965, "epoch": 0.7, "grad_norm": 0.00674022501334548, "learning_rate": 1e-06, "loss": 0.0012, "step": 266 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.30257420241832733, "epoch": 0.7026315789473684, "grad_norm": 0.0077163465321063995, "learning_rate": 1e-06, "loss": 0.0044, "step": 267 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.2949662506580353, "epoch": 0.7052631578947368, "grad_norm": 0.008304405026137829, "learning_rate": 1e-06, "loss": 0.0184, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 940.0, "completions/max_terminated_length": 940.0, "completions/mean_length": 497.12890625, "completions/mean_terminated_length": 497.12890625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.2958240211009979, "epoch": 0.7078947368421052, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01024286262691021, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 113745070.0, "reward": 0.5867675542831421, "reward_std": 0.1438578963279724, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5703125, "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, "rewards/symbolic_reward_partial_score/mean": 0.8152669668197632, "rewards/symbolic_reward_partial_score/std": 0.25280916690826416, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0769691467285156, "sampling/importance_sampling_ratio/min": 4.805782373296097e-05, "sampling/sampling_logp_difference/max": 9.943105697631836, "sampling/sampling_logp_difference/mean": 0.1453056037425995, "step": 269 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3069736659526825, "epoch": 0.7105263157894737, "grad_norm": 0.010676093399524689, "learning_rate": 1e-06, "loss": 0.0034, "step": 270 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.2979844808578491, "epoch": 0.7131578947368421, "grad_norm": 0.007469307165592909, "learning_rate": 1e-06, "loss": 0.0007, "step": 271 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.2992859333753586, "epoch": 0.7157894736842105, "grad_norm": 0.005593322217464447, "learning_rate": 1e-06, "loss": -0.0011, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1640.0, "completions/mean_length": 567.21484375, "completions/mean_terminated_length": 505.1882629394531, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.2983289957046509, "epoch": 0.718421052631579, "frac_reward_zero_std": 0.125, "grad_norm": 0.014190707355737686, "learning_rate": 1e-06, "loss": 0.0025, "num_tokens": 114454972.0, "reward": 0.4660017192363739, "reward_std": 0.15957707166671753, "rewards/progression_diversity/mean": -0.001394656952470541, "rewards/progression_diversity/std": 0.022416062653064728, "rewards/symbolic_reward_accuracy/mean": 0.400390625, "rewards/symbolic_reward_accuracy/std": 0.4904567301273346, "rewards/symbolic_reward_partial_score/mean": 0.75390625, "rewards/symbolic_reward_partial_score/std": 0.24977906048297882, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07451593875885, "sampling/importance_sampling_ratio/min": 0.0007800398161634803, "sampling/sampling_logp_difference/max": 7.156165599822998, "sampling/sampling_logp_difference/mean": 0.13903775811195374, "step": 273 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.3065921664237976, "epoch": 0.7210526315789474, "grad_norm": 0.005344115197658539, "learning_rate": 1e-06, "loss": -0.0017, "step": 274 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.299715131521225, "epoch": 0.7236842105263158, "grad_norm": 0.007578641176223755, "learning_rate": 1e-06, "loss": -0.0033, "step": 275 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2972417175769806, "epoch": 0.7263157894736842, "grad_norm": 0.00895113404840231, "learning_rate": 1e-06, "loss": 0.0145, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 13534.0, "completions/mean_length": 611.857421875, "completions/mean_terminated_length": 518.8978271484375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.2999434769153595, "epoch": 0.7289473684210527, "frac_reward_zero_std": 0.15625, "grad_norm": 0.011687182821333408, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 115203251.0, "reward": 0.5210317373275757, "reward_std": 0.19981878995895386, "rewards/progression_diversity/mean": -0.001316323410719633, "rewards/progression_diversity/std": 0.021824264898896217, "rewards/symbolic_reward_accuracy/mean": 0.47265625, "rewards/symbolic_reward_accuracy/std": 0.49974003434181213, "rewards/symbolic_reward_partial_score/mean": 0.7921549677848816, "rewards/symbolic_reward_partial_score/std": 0.24679899215698242, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.070167899131775, "sampling/importance_sampling_ratio/min": 0.001091643120162189, "sampling/sampling_logp_difference/max": 6.820071220397949, "sampling/sampling_logp_difference/mean": 0.1327057182788849, "step": 277 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.2990414947271347, "epoch": 0.7315789473684211, "grad_norm": 0.0071852910332381725, "learning_rate": 1e-06, "loss": 0.0094, "step": 278 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2953525185585022, "epoch": 0.7342105263157894, "grad_norm": 0.006853134371340275, "learning_rate": 1e-06, "loss": 0.0074, "step": 279 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.29329265654087067, "epoch": 0.7368421052631579, "grad_norm": 0.012452336959540844, "learning_rate": 1e-06, "loss": 0.0192, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1474.0, "completions/max_terminated_length": 1474.0, "completions/mean_length": 508.8125, "completions/mean_terminated_length": 508.8125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.298539400100708, "epoch": 0.7394736842105263, "frac_reward_zero_std": 0.0625, "grad_norm": 0.015085075981914997, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 115887923.0, "reward": 0.4686523675918579, "reward_std": 0.20140579342842102, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.3984375, "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, "rewards/symbolic_reward_partial_score/mean": 0.7652994394302368, "rewards/symbolic_reward_partial_score/std": 0.2437170445919037, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0764374732971191, "sampling/importance_sampling_ratio/min": 0.0012120020110160112, "sampling/sampling_logp_difference/max": 6.715481758117676, "sampling/sampling_logp_difference/mean": 0.14303654432296753, "step": 281 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3021211475133896, "epoch": 0.7421052631578947, "grad_norm": 0.011208699084818363, "learning_rate": 1e-06, "loss": 0.0015, "step": 282 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.3050876259803772, "epoch": 0.7447368421052631, "grad_norm": 0.010669737122952938, "learning_rate": 1e-06, "loss": -0.0001, "step": 283 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.2993925213813782, "epoch": 0.7473684210526316, "grad_norm": 0.00823670532554388, "learning_rate": 1e-06, "loss": 0.0015, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 520.505859375, "completions/mean_terminated_length": 489.4618225097656, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.31090834736824036, "epoch": 0.75, "frac_reward_zero_std": 0.15625, "grad_norm": 0.010960198938846588, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 116568758.0, "reward": 0.4624403119087219, "reward_std": 0.16659079492092133, "rewards/progression_diversity/mean": -0.0010909016709774733, "rewards/progression_diversity/std": 0.02468428760766983, "rewards/symbolic_reward_accuracy/mean": 0.39453125, "rewards/symbolic_reward_accuracy/std": 0.4892277717590332, "rewards/symbolic_reward_partial_score/mean": 0.7530924677848816, "rewards/symbolic_reward_partial_score/std": 0.25413239002227783, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077139139175415, "sampling/importance_sampling_ratio/min": 0.0007212276686914265, "sampling/sampling_logp_difference/max": 7.234555721282959, "sampling/sampling_logp_difference/mean": 0.14307495951652527, "step": 285 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.30131247639656067, "epoch": 0.7526315789473684, "grad_norm": 0.008542075753211975, "learning_rate": 1e-06, "loss": -0.0027, "step": 286 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30619797110557556, "epoch": 0.7552631578947369, "grad_norm": 0.010217522270977497, "learning_rate": 1e-06, "loss": 0.0018, "step": 287 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.2946513742208481, "epoch": 0.7578947368421053, "grad_norm": 0.008310631848871708, "learning_rate": 1e-06, "loss": 0.0136, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 567.96875, "completions/mean_terminated_length": 474.7505187988281, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.305755078792572, "epoch": 0.7605263157894737, "frac_reward_zero_std": 0.15625, "grad_norm": 0.012845570221543312, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 117272710.0, "reward": 0.5138547420501709, "reward_std": 0.18507125973701477, "rewards/progression_diversity/mean": -0.0012514127884060144, "rewards/progression_diversity/std": 0.020418958738446236, "rewards/symbolic_reward_accuracy/mean": 0.462890625, "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, "rewards/symbolic_reward_partial_score/mean": 0.7884114980697632, "rewards/symbolic_reward_partial_score/std": 0.24423162639141083, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073378086090088, "sampling/importance_sampling_ratio/min": 0.0019662980921566486, "sampling/sampling_logp_difference/max": 6.231602668762207, "sampling/sampling_logp_difference/mean": 0.13790735602378845, "step": 289 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.2955671697854996, "epoch": 0.7631578947368421, "grad_norm": 0.007032850757241249, "learning_rate": 1e-06, "loss": 0.0306, "step": 290 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3038811683654785, "epoch": 0.7657894736842106, "grad_norm": 0.012003585696220398, "learning_rate": 1e-06, "loss": 0.0126, "step": 291 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3101692795753479, "epoch": 0.7684210526315789, "grad_norm": 0.006113457027822733, "learning_rate": 1e-06, "loss": -0.0008, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1499.0, "completions/max_terminated_length": 1499.0, "completions/mean_length": 483.70703125, "completions/mean_terminated_length": 483.70703125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3106113076210022, "epoch": 0.7710526315789473, "frac_reward_zero_std": 0.15625, "grad_norm": 0.011551190167665482, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 117922192.0, "reward": 0.4702148735523224, "reward_std": 0.16088829934597015, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.39453125, "rewards/symbolic_reward_accuracy/std": 0.4892277717590332, "rewards/symbolic_reward_partial_score/mean": 0.7783203125, "rewards/symbolic_reward_partial_score/std": 0.23153917491436005, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782867670059204, "sampling/importance_sampling_ratio/min": 5.172830242372584e-06, "sampling/sampling_logp_difference/max": 12.172090530395508, "sampling/sampling_logp_difference/mean": 0.14682671427726746, "step": 293 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3070777803659439, "epoch": 0.7736842105263158, "grad_norm": 0.014432739466428757, "learning_rate": 1e-06, "loss": 0.0002, "step": 294 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30722641944885254, "epoch": 0.7763157894736842, "grad_norm": 0.007676142267882824, "learning_rate": 1e-06, "loss": 0.0007, "step": 295 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3001379519701004, "epoch": 0.7789473684210526, "grad_norm": 0.007182000204920769, "learning_rate": 1e-06, "loss": 0.0028, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 536.740234375, "completions/mean_terminated_length": 474.5941467285156, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3064696937799454, "epoch": 0.781578947368421, "frac_reward_zero_std": 0.125, "grad_norm": 0.011220471933484077, "learning_rate": 1e-06, "loss": 0.0027, "num_tokens": 118605643.0, "reward": 0.5557116270065308, "reward_std": 0.202279195189476, "rewards/progression_diversity/mean": -0.00012732444156426936, "rewards/progression_diversity/std": 0.002881023334339261, "rewards/symbolic_reward_accuracy/mean": 0.51953125, "rewards/symbolic_reward_accuracy/std": 0.5001069903373718, "rewards/symbolic_reward_partial_score/mean": 0.8146158456802368, "rewards/symbolic_reward_partial_score/std": 0.24289114773273468, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074270248413086, "sampling/importance_sampling_ratio/min": 0.0009082350297830999, "sampling/sampling_logp_difference/max": 7.004007339477539, "sampling/sampling_logp_difference/mean": 0.13980567455291748, "step": 297 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3012381047010422, "epoch": 0.7842105263157895, "grad_norm": 0.006437814328819513, "learning_rate": 1e-06, "loss": 0.0126, "step": 298 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3034280389547348, "epoch": 0.7868421052631579, "grad_norm": 0.009115688502788544, "learning_rate": 1e-06, "loss": 0.0252, "step": 299 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.2999880909919739, "epoch": 0.7894736842105263, "grad_norm": 0.008743058890104294, "learning_rate": 1e-06, "loss": 0.0004, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 504.83984375, "completions/mean_terminated_length": 473.7651672363281, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.30167509615421295, "epoch": 0.7921052631578948, "frac_reward_zero_std": 0.15625, "grad_norm": 0.013987814076244831, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 119287961.0, "reward": 0.5052167177200317, "reward_std": 0.21000981330871582, "rewards/progression_diversity/mean": -0.0007912339060567319, "rewards/progression_diversity/std": 0.017903579398989677, "rewards/symbolic_reward_accuracy/mean": 0.44921875, "rewards/symbolic_reward_accuracy/std": 0.497901052236557, "rewards/symbolic_reward_partial_score/mean": 0.7862955927848816, "rewards/symbolic_reward_partial_score/std": 0.24358125030994415, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0761265754699707, "sampling/importance_sampling_ratio/min": 0.0016627967124804854, "sampling/sampling_logp_difference/max": 6.399254322052002, "sampling/sampling_logp_difference/mean": 0.14250260591506958, "step": 301 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.30729566514492035, "epoch": 0.7947368421052632, "grad_norm": 0.011272015050053596, "learning_rate": 1e-06, "loss": -0.0021, "step": 302 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.30154599249362946, "epoch": 0.7973684210526316, "grad_norm": 0.007906567305326462, "learning_rate": 1e-06, "loss": 0.0129, "step": 303 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.2421875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.30861087143421173, "epoch": 0.8, "grad_norm": 0.009575615637004375, "learning_rate": 1e-06, "loss": 0.0031, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 475.01171875, "completions/mean_terminated_length": 475.01171875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3128325790166855, "epoch": 0.8026315789473685, "frac_reward_zero_std": 0.1875, "grad_norm": 0.011371531523764133, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 119939647.0, "reward": 0.5709472894668579, "reward_std": 0.1911071240901947, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.541015625, "rewards/symbolic_reward_accuracy/std": 0.49880221486091614, "rewards/symbolic_reward_partial_score/mean": 0.8211263418197632, "rewards/symbolic_reward_partial_score/std": 0.22490404546260834, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0794188976287842, "sampling/importance_sampling_ratio/min": 0.0021937647834420204, "sampling/sampling_logp_difference/max": 6.122136116027832, "sampling/sampling_logp_difference/mean": 0.14725090563297272, "step": 305 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.30579154193401337, "epoch": 0.8052631578947368, "grad_norm": 0.007889126427471638, "learning_rate": 1e-06, "loss": 0.0028, "step": 306 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3117325156927109, "epoch": 0.8078947368421052, "grad_norm": 0.008403713814914227, "learning_rate": 1e-06, "loss": 0.0012, "step": 307 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3067573755979538, "epoch": 0.8105263157894737, "grad_norm": 0.011215373873710632, "learning_rate": 1e-06, "loss": -0.0003, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 470.318359375, "completions/mean_terminated_length": 470.318359375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3116874247789383, "epoch": 0.8131578947368421, "frac_reward_zero_std": 0.28125, "grad_norm": 0.010915424674749374, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 120571586.0, "reward": 0.59326171875, "reward_std": 0.15734408795833588, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5703125, "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, "rewards/symbolic_reward_partial_score/mean": 0.8369140625, "rewards/symbolic_reward_partial_score/std": 0.21129031479358673, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0776973962783813, "sampling/importance_sampling_ratio/min": 7.561699021607637e-06, "sampling/sampling_logp_difference/max": 11.792414665222168, "sampling/sampling_logp_difference/mean": 0.1471216082572937, "step": 309 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3030817359685898, "epoch": 0.8157894736842105, "grad_norm": 0.005430555436760187, "learning_rate": 1e-06, "loss": -0.0003, "step": 310 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3055662661790848, "epoch": 0.8184210526315789, "grad_norm": 0.007991395890712738, "learning_rate": 1e-06, "loss": 0.0003, "step": 311 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3136454224586487, "epoch": 0.8210526315789474, "grad_norm": 0.008103788830339909, "learning_rate": 1e-06, "loss": 0.0007, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 502.697265625, "completions/mean_terminated_length": 471.6183776855469, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3072304129600525, "epoch": 0.8236842105263158, "frac_reward_zero_std": 0.09375, "grad_norm": 0.017745062708854675, "learning_rate": 1e-06, "loss": 0.0085, "num_tokens": 121238951.0, "reward": 0.5021483898162842, "reward_std": 0.20283925533294678, "rewards/progression_diversity/mean": -5.9837475419044495e-06, "rewards/progression_diversity/std": 0.00013539673818740994, "rewards/symbolic_reward_accuracy/mean": 0.443359375, "rewards/symbolic_reward_accuracy/std": 0.49726733565330505, "rewards/symbolic_reward_partial_score/mean": 0.7877604365348816, "rewards/symbolic_reward_partial_score/std": 0.2305394411087036, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0763132572174072, "sampling/importance_sampling_ratio/min": 0.00018456965335644782, "sampling/sampling_logp_difference/max": 8.59748363494873, "sampling/sampling_logp_difference/mean": 0.14485155045986176, "step": 313 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.30788813531398773, "epoch": 0.8263157894736842, "grad_norm": 0.008959567174315453, "learning_rate": 1e-06, "loss": -0.0008, "step": 314 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.30733516812324524, "epoch": 0.8289473684210527, "grad_norm": 0.008335716091096401, "learning_rate": 1e-06, "loss": 0.0009, "step": 315 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.484375, "entropy": 0.30802886188030243, "epoch": 0.8315789473684211, "grad_norm": 0.007674569729715586, "learning_rate": 1e-06, "loss": 0.0006, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 470.5078125, "completions/mean_terminated_length": 470.5078125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3161182999610901, "epoch": 0.8342105263157895, "frac_reward_zero_std": 0.15625, "grad_norm": 0.015065652318298817, "learning_rate": 1e-06, "loss": -0.0028, "num_tokens": 121902347.0, "reward": 0.4985351860523224, "reward_std": 0.1782730519771576, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.43359375, "rewards/symbolic_reward_accuracy/std": 0.4960552453994751, "rewards/symbolic_reward_partial_score/mean": 0.7945963144302368, "rewards/symbolic_reward_partial_score/std": 0.22836259007453918, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0780564546585083, "sampling/importance_sampling_ratio/min": 3.911755629815161e-05, "sampling/sampling_logp_difference/max": 10.14893913269043, "sampling/sampling_logp_difference/mean": 0.14812889695167542, "step": 317 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3078267425298691, "epoch": 0.8368421052631579, "grad_norm": 0.006529625505208969, "learning_rate": 1e-06, "loss": 0.0029, "step": 318 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3183140605688095, "epoch": 0.8394736842105263, "grad_norm": 0.005689030978828669, "learning_rate": 1e-06, "loss": 0.0001, "step": 319 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.30473969876766205, "epoch": 0.8421052631578947, "grad_norm": 0.013739674352109432, "learning_rate": 1e-06, "loss": 0.0024, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 588.193359375, "completions/mean_terminated_length": 463.8169250488281, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.3014228492975235, "epoch": 0.8447368421052631, "frac_reward_zero_std": 0.21875, "grad_norm": 0.011865752749145031, "learning_rate": 1e-06, "loss": 0.0318, "num_tokens": 122591950.0, "reward": 0.631678581237793, "reward_std": 0.19821570813655853, "rewards/progression_diversity/mean": -0.001093248138204217, "rewards/progression_diversity/std": 0.02473738044500351, "rewards/symbolic_reward_accuracy/mean": 0.625, "rewards/symbolic_reward_accuracy/std": 0.4845963716506958, "rewards/symbolic_reward_partial_score/mean": 0.8582357168197632, "rewards/symbolic_reward_partial_score/std": 0.21694619953632355, "rewards/tag_count_reward/mean": -0.0078125, "rewards/tag_count_reward/std": 0.08812850713729858, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0716766119003296, "sampling/importance_sampling_ratio/min": 0.0003383133444003761, "sampling/sampling_logp_difference/max": 7.991538047790527, "sampling/sampling_logp_difference/mean": 0.13535773754119873, "step": 321 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.3073635548353195, "epoch": 0.8473684210526315, "grad_norm": 0.011415022425353527, "learning_rate": 1e-06, "loss": 0.0218, "step": 322 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31290285289287567, "epoch": 0.85, "grad_norm": 0.007157693617045879, "learning_rate": 1e-06, "loss": -0.0026, "step": 323 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3036612421274185, "epoch": 0.8526315789473684, "grad_norm": 0.01231673825532198, "learning_rate": 1e-06, "loss": 0.0094, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 493.33984375, "completions/mean_terminated_length": 462.2426452636719, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.3125806152820587, "epoch": 0.8552631578947368, "frac_reward_zero_std": 0.125, "grad_norm": 0.013022052124142647, "learning_rate": 1e-06, "loss": 0.0026, "num_tokens": 123257532.0, "reward": 0.5332888960838318, "reward_std": 0.1853906512260437, "rewards/progression_diversity/mean": -0.0011929721804335713, "rewards/progression_diversity/std": 0.026993878185749054, "rewards/symbolic_reward_accuracy/mean": 0.4921875, "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, "rewards/symbolic_reward_partial_score/mean": 0.7939453125, "rewards/symbolic_reward_partial_score/std": 0.2435741126537323, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774052143096924, "sampling/importance_sampling_ratio/min": 0.0012375212972983718, "sampling/sampling_logp_difference/max": 6.694644927978516, "sampling/sampling_logp_difference/mean": 0.14501085877418518, "step": 325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3106594383716583, "epoch": 0.8578947368421053, "grad_norm": 0.005496948957443237, "learning_rate": 1e-06, "loss": -0.0007, "step": 326 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3180868923664093, "epoch": 0.8605263157894737, "grad_norm": 0.007096852641552687, "learning_rate": 1e-06, "loss": -0.0002, "step": 327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.421875, "entropy": 0.30386729538440704, "epoch": 0.8631578947368421, "grad_norm": 0.009813301265239716, "learning_rate": 1e-06, "loss": 0.0258, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 504.208984375, "completions/mean_terminated_length": 473.133056640625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.319959819316864, "epoch": 0.8657894736842106, "frac_reward_zero_std": 0.15625, "grad_norm": 0.012554957531392574, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 123914823.0, "reward": 0.5417969226837158, "reward_std": 0.17694982886314392, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5, "rewards/symbolic_reward_accuracy/std": 0.5004889965057373, "rewards/symbolic_reward_partial_score/mean": 0.806640625, "rewards/symbolic_reward_partial_score/std": 0.24059468507766724, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078836441040039, "sampling/importance_sampling_ratio/min": 5.7318637118441984e-05, "sampling/sampling_logp_difference/max": 9.766884803771973, "sampling/sampling_logp_difference/mean": 0.14754198491573334, "step": 329 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3119961619377136, "epoch": 0.868421052631579, "grad_norm": 0.012830966152250767, "learning_rate": 1e-06, "loss": 0.0273, "step": 330 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.32057128846645355, "epoch": 0.8710526315789474, "grad_norm": 0.008462558500468731, "learning_rate": 1e-06, "loss": -0.0005, "step": 331 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.390625, "entropy": 0.3068248778581619, "epoch": 0.8736842105263158, "grad_norm": 0.009940098971128464, "learning_rate": 1e-06, "loss": 0.0014, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1060.0, "completions/mean_length": 487.529296875, "completions/mean_terminated_length": 456.4207458496094, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.314155712723732, "epoch": 0.8763157894736842, "frac_reward_zero_std": 0.125, "grad_norm": 0.015469806268811226, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 124568246.0, "reward": 0.5467255115509033, "reward_std": 0.22133252024650574, "rewards/progression_diversity/mean": -0.0002992129884660244, "rewards/progression_diversity/std": 0.0067704166285693645, "rewards/symbolic_reward_accuracy/mean": 0.50390625, "rewards/symbolic_reward_accuracy/std": 0.5004737377166748, "rewards/symbolic_reward_partial_score/mean": 0.8152669668197632, "rewards/symbolic_reward_partial_score/std": 0.22244098782539368, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076984167098999, "sampling/importance_sampling_ratio/min": 0.0020317258313298225, "sampling/sampling_logp_difference/max": 6.198869705200195, "sampling/sampling_logp_difference/mean": 0.14370480179786682, "step": 333 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.31249886751174927, "epoch": 0.8789473684210526, "grad_norm": 0.009735578671097755, "learning_rate": 1e-06, "loss": 0.0035, "step": 334 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.2265625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4140625, "entropy": 0.30799588561058044, "epoch": 0.881578947368421, "grad_norm": 0.008018133230507374, "learning_rate": 1e-06, "loss": 0.0031, "step": 335 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.203125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.30934616923332214, "epoch": 0.8842105263157894, "grad_norm": 0.00663616880774498, "learning_rate": 1e-06, "loss": -0.002, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 471.818359375, "completions/mean_terminated_length": 471.818359375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.3062121123075485, "epoch": 0.8868421052631579, "frac_reward_zero_std": 0.25, "grad_norm": 0.009202565997838974, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 125208089.0, "reward": 0.6447754502296448, "reward_std": 0.17742466926574707, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.640625, "rewards/symbolic_reward_accuracy/std": 0.48028653860092163, "rewards/symbolic_reward_partial_score/mean": 0.8680013418197632, "rewards/symbolic_reward_partial_score/std": 0.21553559601306915, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774188041687012, "sampling/importance_sampling_ratio/min": 2.0920735551044345e-05, "sampling/sampling_logp_difference/max": 10.77476978302002, "sampling/sampling_logp_difference/mean": 0.1450258493423462, "step": 337 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3057921230792999, "epoch": 0.8894736842105263, "grad_norm": 0.008431348949670792, "learning_rate": 1e-06, "loss": 0.0051, "step": 338 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3095565140247345, "epoch": 0.8921052631578947, "grad_norm": 0.005716415587812662, "learning_rate": 1e-06, "loss": 0.0012, "step": 339 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3105488568544388, "epoch": 0.8947368421052632, "grad_norm": 0.0054482058621943, "learning_rate": 1e-06, "loss": -0.0037, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1476.0, "completions/mean_length": 529.880859375, "completions/mean_terminated_length": 467.7078857421875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3118719011545181, "epoch": 0.8973684210526316, "frac_reward_zero_std": 0.1875, "grad_norm": 0.014669405296444893, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 125877020.0, "reward": 0.533341646194458, "reward_std": 0.16296130418777466, "rewards/progression_diversity/mean": -0.000796740350779146, "rewards/progression_diversity/std": 0.018028177320957184, "rewards/symbolic_reward_accuracy/mean": 0.482421875, "rewards/symbolic_reward_accuracy/std": 0.5001795887947083, "rewards/symbolic_reward_partial_score/mean": 0.81494140625, "rewards/symbolic_reward_partial_score/std": 0.22069676220417023, "rewards/tag_count_reward/mean": -0.005859375, "rewards/tag_count_reward/std": 0.07639661431312561, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0750058889389038, "sampling/importance_sampling_ratio/min": 0.000702597841154784, "sampling/sampling_logp_difference/max": 7.260725975036621, "sampling/sampling_logp_difference/mean": 0.14171919226646423, "step": 341 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.307451456785202, "epoch": 0.9, "grad_norm": 0.01112651638686657, "learning_rate": 1e-06, "loss": 0.0109, "step": 342 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.31406456232070923, "epoch": 0.9026315789473685, "grad_norm": 0.010442834347486496, "learning_rate": 1e-06, "loss": 0.0, "step": 343 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2734375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.30461055040359497, "epoch": 0.9052631578947369, "grad_norm": 0.011064223945140839, "learning_rate": 1e-06, "loss": 0.0225, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 537.166015625, "completions/mean_terminated_length": 475.0216064453125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.3110172599554062, "epoch": 0.9078947368421053, "frac_reward_zero_std": 0.15625, "grad_norm": 0.011030428111553192, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 126580721.0, "reward": 0.48596975207328796, "reward_std": 0.14998123049736023, "rewards/progression_diversity/mean": -0.0016597331268712878, "rewards/progression_diversity/std": 0.03166636824607849, "rewards/symbolic_reward_accuracy/mean": 0.41015625, "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, "rewards/symbolic_reward_partial_score/mean": 0.8009440302848816, "rewards/symbolic_reward_partial_score/std": 0.20570434629917145, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075448751449585, "sampling/importance_sampling_ratio/min": 2.118588236044161e-05, "sampling/sampling_logp_difference/max": 10.762175559997559, "sampling/sampling_logp_difference/mean": 0.14029094576835632, "step": 345 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3086661100387573, "epoch": 0.9105263157894737, "grad_norm": 0.011540532112121582, "learning_rate": 1e-06, "loss": 0.005, "step": 346 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.2966995984315872, "epoch": 0.9131578947368421, "grad_norm": 0.01433405838906765, "learning_rate": 1e-06, "loss": 0.0187, "step": 347 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2890625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4296875, "entropy": 0.31083013117313385, "epoch": 0.9157894736842105, "grad_norm": 0.008436622098088264, "learning_rate": 1e-06, "loss": -0.0028, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 480.43359375, "completions/mean_terminated_length": 480.43359375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.309502974152565, "epoch": 0.9184210526315789, "frac_reward_zero_std": 0.21875, "grad_norm": 0.011041209101676941, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 127224495.0, "reward": 0.5547363758087158, "reward_std": 0.1895267367362976, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5078125, "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, "rewards/symbolic_reward_partial_score/mean": 0.83349609375, "rewards/symbolic_reward_partial_score/std": 0.2128904014825821, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0771770477294922, "sampling/importance_sampling_ratio/min": 3.5030089406973275e-07, "sampling/sampling_logp_difference/max": 14.864473342895508, "sampling/sampling_logp_difference/mean": 0.1465146243572235, "step": 349 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31064388155937195, "epoch": 0.9210526315789473, "grad_norm": 0.00748514523729682, "learning_rate": 1e-06, "loss": 0.002, "step": 350 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3066399097442627, "epoch": 0.9236842105263158, "grad_norm": 0.010583535768091679, "learning_rate": 1e-06, "loss": -0.0031, "step": 351 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.3035551458597183, "epoch": 0.9263157894736842, "grad_norm": 0.006878511048853397, "learning_rate": 1e-06, "loss": 0.0023, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 486.05078125, "completions/mean_terminated_length": 486.05078125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "entropy": 0.3168664276599884, "epoch": 0.9289473684210526, "frac_reward_zero_std": 0.25, "grad_norm": 0.01184882689267397, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 127881993.0, "reward": 0.5418945550918579, "reward_std": 0.17704787850379944, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.490234375, "rewards/symbolic_reward_accuracy/std": 0.5003935098648071, "rewards/symbolic_reward_partial_score/mean": 0.8264973759651184, "rewards/symbolic_reward_partial_score/std": 0.21585585176944733, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782511234283447, "sampling/importance_sampling_ratio/min": 0.00025796302361413836, "sampling/sampling_logp_difference/max": 8.262694358825684, "sampling/sampling_logp_difference/mean": 0.14865058660507202, "step": 353 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30700162053108215, "epoch": 0.9315789473684211, "grad_norm": 0.005886297207325697, "learning_rate": 1e-06, "loss": -0.0009, "step": 354 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.21875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3129795789718628, "epoch": 0.9342105263157895, "grad_norm": 0.01028487179428339, "learning_rate": 1e-06, "loss": 0.0019, "step": 355 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3170662522315979, "epoch": 0.9368421052631579, "grad_norm": 0.007761337794363499, "learning_rate": 1e-06, "loss": 0.0002, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 545.998046875, "completions/mean_terminated_length": 483.8882751464844, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.30055612325668335, "epoch": 0.9394736842105263, "frac_reward_zero_std": 0.28125, "grad_norm": 0.012874174863100052, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 128565512.0, "reward": 0.6208984851837158, "reward_std": 0.15273459255695343, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.609375, "rewards/symbolic_reward_accuracy/std": 0.48836761713027954, "rewards/symbolic_reward_partial_score/mean": 0.8522135615348816, "rewards/symbolic_reward_partial_score/std": 0.22056303918361664, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075897216796875, "sampling/importance_sampling_ratio/min": 0.0001214630319736898, "sampling/sampling_logp_difference/max": 9.015900611877441, "sampling/sampling_logp_difference/mean": 0.14393819868564606, "step": 357 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.31270015239715576, "epoch": 0.9421052631578948, "grad_norm": 0.007534582633525133, "learning_rate": 1e-06, "loss": -0.0021, "step": 358 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3103422075510025, "epoch": 0.9447368421052632, "grad_norm": 0.008300635032355785, "learning_rate": 1e-06, "loss": 0.0024, "step": 359 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31079424917697906, "epoch": 0.9473684210526315, "grad_norm": 0.007545569911599159, "learning_rate": 1e-06, "loss": 0.0115, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 517.212890625, "completions/mean_terminated_length": 486.16241455078125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.30875363945961, "epoch": 0.95, "frac_reward_zero_std": 0.25, "grad_norm": 0.007635221816599369, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 129216469.0, "reward": 0.5335408449172974, "reward_std": 0.1456352025270462, "rewards/progression_diversity/mean": -0.00040900660678744316, "rewards/progression_diversity/std": 0.009254762902855873, "rewards/symbolic_reward_accuracy/mean": 0.474609375, "rewards/symbolic_reward_accuracy/std": 0.4998432695865631, "rewards/symbolic_reward_partial_score/mean": 0.8299154043197632, "rewards/symbolic_reward_partial_score/std": 0.20124566555023193, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076637864112854, "sampling/importance_sampling_ratio/min": 1.8904121361629223e-06, "sampling/sampling_logp_difference/max": 13.178715705871582, "sampling/sampling_logp_difference/mean": 0.14335882663726807, "step": 361 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.30543169379234314, "epoch": 0.9526315789473684, "grad_norm": 0.010882689617574215, "learning_rate": 1e-06, "loss": 0.0136, "step": 362 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3083820194005966, "epoch": 0.9552631578947368, "grad_norm": 0.01051326934248209, "learning_rate": 1e-06, "loss": 0.0002, "step": 363 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30124035477638245, "epoch": 0.9578947368421052, "grad_norm": 0.006582474801689386, "learning_rate": 1e-06, "loss": 0.0014, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 473.248046875, "completions/mean_terminated_length": 473.248046875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.30113451182842255, "epoch": 0.9605263157894737, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01765936240553856, "learning_rate": 1e-06, "loss": 0.0037, "num_tokens": 129851252.0, "reward": 0.562792956829071, "reward_std": 0.1252935379743576, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5234375, "rewards/symbolic_reward_accuracy/std": 0.49993884563446045, "rewards/symbolic_reward_partial_score/mean": 0.8291015625, "rewards/symbolic_reward_partial_score/std": 0.22126927971839905, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0775682926177979, "sampling/importance_sampling_ratio/min": 0.0008246903889812529, "sampling/sampling_logp_difference/max": 7.1005024909973145, "sampling/sampling_logp_difference/mean": 0.14582598209381104, "step": 365 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3071126341819763, "epoch": 0.9631578947368421, "grad_norm": 0.009068193845450878, "learning_rate": 1e-06, "loss": 0.0018, "step": 366 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.30289457738399506, "epoch": 0.9657894736842105, "grad_norm": 0.004140099510550499, "learning_rate": 1e-06, "loss": -0.0028, "step": 367 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3062494695186615, "epoch": 0.968421052631579, "grad_norm": 0.007102786097675562, "learning_rate": 1e-06, "loss": -0.0007, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 469.71484375, "completions/mean_terminated_length": 469.71484375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3085251748561859, "epoch": 0.9710526315789474, "frac_reward_zero_std": 0.25, "grad_norm": 0.010728326626121998, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 130498562.0, "reward": 0.5833496451377869, "reward_std": 0.17793656885623932, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.546875, "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, "rewards/symbolic_reward_partial_score/mean": 0.8507487177848816, "rewards/symbolic_reward_partial_score/std": 0.19178888201713562, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768487453460693, "sampling/importance_sampling_ratio/min": 0.0007275677635334432, "sampling/sampling_logp_difference/max": 7.225803375244141, "sampling/sampling_logp_difference/mean": 0.1458495855331421, "step": 369 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30653318762779236, "epoch": 0.9736842105263158, "grad_norm": 0.008490893058478832, "learning_rate": 1e-06, "loss": -0.0006, "step": 370 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.30405572056770325, "epoch": 0.9763157894736842, "grad_norm": 0.0068202815018594265, "learning_rate": 1e-06, "loss": 0.0006, "step": 371 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.30216623842716217, "epoch": 0.9789473684210527, "grad_norm": 0.013265615329146385, "learning_rate": 1e-06, "loss": -0.0006, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 503.1953125, "completions/mean_terminated_length": 472.1174011230469, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.30290573835372925, "epoch": 0.9815789473684211, "frac_reward_zero_std": 0.21875, "grad_norm": 0.014201073907315731, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 131153990.0, "reward": 0.6204589605331421, "reward_std": 0.204176664352417, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.6015625, "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, "rewards/symbolic_reward_partial_score/mean": 0.86572265625, "rewards/symbolic_reward_partial_score/std": 0.19912473857402802, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0761308670043945, "sampling/importance_sampling_ratio/min": 0.0017201791051775217, "sampling/sampling_logp_difference/max": 6.365326881408691, "sampling/sampling_logp_difference/mean": 0.1428317129611969, "step": 373 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3077033460140228, "epoch": 0.9842105263157894, "grad_norm": 0.007973029278218746, "learning_rate": 1e-06, "loss": 0.0029, "step": 374 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.2994297742843628, "epoch": 0.9868421052631579, "grad_norm": 0.006763116456568241, "learning_rate": 1e-06, "loss": 0.0263, "step": 375 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.304544135928154, "epoch": 0.9894736842105263, "grad_norm": 0.007952687330543995, "learning_rate": 1e-06, "loss": -0.0006, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 471.927734375, "completions/mean_terminated_length": 471.927734375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.3087266832590103, "epoch": 0.9921052631578947, "frac_reward_zero_std": 0.28125, "grad_norm": 0.011059445329010487, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 131803233.0, "reward": 0.6175293326377869, "reward_std": 0.15380731225013733, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.59765625, "rewards/symbolic_reward_accuracy/std": 0.4908501207828522, "rewards/symbolic_reward_partial_score/mean": 0.8631184697151184, "rewards/symbolic_reward_partial_score/std": 0.20576836168766022, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076733112335205, "sampling/importance_sampling_ratio/min": 0.00020886259153485298, "sampling/sampling_logp_difference/max": 8.473834037780762, "sampling/sampling_logp_difference/mean": 0.14535820484161377, "step": 377 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.30368588864803314, "epoch": 0.9947368421052631, "grad_norm": 0.007541055791079998, "learning_rate": 1e-06, "loss": -0.0002, "step": 378 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.30102120339870453, "epoch": 0.9973684210526316, "grad_norm": 0.008830598555505276, "learning_rate": 1e-06, "loss": 0.0003, "step": 379 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.30921386182308197, "epoch": 1.0, "grad_norm": 0.008218697272241116, "learning_rate": 1e-06, "loss": 0.0027, "step": 380 }, { "epoch": 1.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.002685546875, "eval_completions/max_length": 4941.03125, "eval_completions/max_terminated_length": 1225.71875, "eval_completions/mean_length": 509.00732421875, "eval_completions/mean_terminated_length": 466.2677412033081, "eval_completions/min_length": 191.28125, "eval_completions/min_terminated_length": 191.28125, "eval_entropy": 0.30122990906238556, "eval_frac_reward_zero_std": 0.265625, "eval_loss": 0.00210479530505836, "eval_num_tokens": 131803233.0, "eval_reward": 0.6341025996953249, "eval_reward_std": 0.17670447006821632, "eval_rewards/progression_diversity/mean": -0.0003870166310662171, "eval_rewards/progression_diversity/std": 0.003330339086460299, "eval_rewards/symbolic_reward_accuracy/mean": 0.62890625, "eval_rewards/symbolic_reward_accuracy/std": 0.46045348327606916, "eval_rewards/symbolic_reward_partial_score/mean": 0.8587239664047956, "eval_rewards/symbolic_reward_partial_score/std": 0.20982443122193217, "eval_rewards/tag_count_reward/mean": -0.008544921875, "eval_rewards/tag_count_reward/std": 0.05361618706956506, "eval_runtime": 573.6785, "eval_samples_per_second": 0.436, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0764745026826859, "eval_sampling/importance_sampling_ratio/min": 0.0018949521987123776, "eval_sampling/sampling_logp_difference/max": 19.122781857848167, "eval_sampling/sampling_logp_difference/mean": 0.14713718881830573, "eval_steps_per_second": 0.003, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 897.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 453.083984375, "completions/mean_terminated_length": 453.083984375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.31483590602874756, "epoch": 1.0026315789473683, "frac_reward_zero_std": 0.21875, "grad_norm": 0.011226167902350426, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 132457548.0, "reward": 0.536572277545929, "reward_std": 0.17151208221912384, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.484375, "rewards/symbolic_reward_accuracy/std": 0.5002445578575134, "rewards/symbolic_reward_partial_score/mean": 0.81982421875, "rewards/symbolic_reward_partial_score/std": 0.2147509604692459, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0785313844680786, "sampling/importance_sampling_ratio/min": 4.000367334811017e-05, "sampling/sampling_logp_difference/max": 10.12653923034668, "sampling/sampling_logp_difference/mean": 0.1462843418121338, "step": 381 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30385373532772064, "epoch": 1.0052631578947369, "grad_norm": 0.007102385628968477, "learning_rate": 1e-06, "loss": 0.0008, "step": 382 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.3114987909793854, "epoch": 1.0078947368421052, "grad_norm": 0.010028541088104248, "learning_rate": 1e-06, "loss": -0.0008, "step": 383 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30750955641269684, "epoch": 1.0105263157894737, "grad_norm": 0.007849895395338535, "learning_rate": 1e-06, "loss": -0.0004, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 487.294921875, "completions/mean_terminated_length": 456.1859130859375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "entropy": 0.30419932305812836, "epoch": 1.013157894736842, "frac_reward_zero_std": 0.09375, "grad_norm": 0.00997752696275711, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 133121859.0, "reward": 0.5177246332168579, "reward_std": 0.16577917337417603, "rewards/progression_diversity/mean": -1.0128132998943329e-07, "rewards/progression_diversity/std": 2.2917349724593805e-06, "rewards/symbolic_reward_accuracy/mean": 0.453125, "rewards/symbolic_reward_accuracy/std": 0.4982847273349762, "rewards/symbolic_reward_partial_score/mean": 0.8201497793197632, "rewards/symbolic_reward_partial_score/std": 0.19332556426525116, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0750830173492432, "sampling/importance_sampling_ratio/min": 3.3023450669134036e-05, "sampling/sampling_logp_difference/max": 10.318292617797852, "sampling/sampling_logp_difference/mean": 0.14270050823688507, "step": 385 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4609375, "entropy": 0.301358163356781, "epoch": 1.0157894736842106, "grad_norm": 0.007478445768356323, "learning_rate": 1e-06, "loss": 0.0014, "step": 386 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.3203125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.46875, "entropy": 0.3112320601940155, "epoch": 1.018421052631579, "grad_norm": 0.01002897322177887, "learning_rate": 1e-06, "loss": -0.0012, "step": 387 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.2982504069805145, "epoch": 1.0210526315789474, "grad_norm": 0.008274735882878304, "learning_rate": 1e-06, "loss": 0.03, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 446.642578125, "completions/mean_terminated_length": 446.642578125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.30985428392887115, "epoch": 1.0236842105263158, "frac_reward_zero_std": 0.4375, "grad_norm": 0.007278556935489178, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 133741996.0, "reward": 0.6395508050918579, "reward_std": 0.09756526350975037, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.625, "rewards/symbolic_reward_accuracy/std": 0.4845963716506958, "rewards/symbolic_reward_partial_score/mean": 0.8818359375, "rewards/symbolic_reward_partial_score/std": 0.18144503235816956, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0790255069732666, "sampling/importance_sampling_ratio/min": 0.0008311926503665745, "sampling/sampling_logp_difference/max": 7.092648983001709, "sampling/sampling_logp_difference/mean": 0.1477079838514328, "step": 389 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3042980283498764, "epoch": 1.0263157894736843, "grad_norm": 0.009012778289616108, "learning_rate": 1e-06, "loss": 0.0034, "step": 390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.313108429312706, "epoch": 1.0289473684210526, "grad_norm": 0.00844503566622734, "learning_rate": 1e-06, "loss": 0.0003, "step": 391 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3174958974123001, "epoch": 1.0315789473684212, "grad_norm": 0.01085878349840641, "learning_rate": 1e-06, "loss": -0.0024, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 441.478515625, "completions/mean_terminated_length": 441.478515625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.31333790719509125, "epoch": 1.0342105263157895, "frac_reward_zero_std": 0.28125, "grad_norm": 0.008390745148062706, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 134371841.0, "reward": 0.5450682640075684, "reward_std": 0.11910407245159149, "rewards/progression_diversity/mean": -7.797831131028943e-06, "rewards/progression_diversity/std": 0.00017644476611167192, "rewards/symbolic_reward_accuracy/mean": 0.490234375, "rewards/symbolic_reward_accuracy/std": 0.5003935098648071, "rewards/symbolic_reward_partial_score/mean": 0.83642578125, "rewards/symbolic_reward_partial_score/std": 0.18248093128204346, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078395128250122, "sampling/importance_sampling_ratio/min": 7.312164962058887e-05, "sampling/sampling_logp_difference/max": 9.523386001586914, "sampling/sampling_logp_difference/mean": 0.14865905046463013, "step": 393 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3087403327226639, "epoch": 1.0368421052631578, "grad_norm": 0.007888459600508213, "learning_rate": 1e-06, "loss": -0.0029, "step": 394 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3131715953350067, "epoch": 1.0394736842105263, "grad_norm": 0.008880356326699257, "learning_rate": 1e-06, "loss": 0.0029, "step": 395 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3134143352508545, "epoch": 1.0421052631578946, "grad_norm": 0.006940594408661127, "learning_rate": 1e-06, "loss": 0.0036, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 442.345703125, "completions/mean_terminated_length": 442.345703125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3102358728647232, "epoch": 1.0447368421052632, "frac_reward_zero_std": 0.28125, "grad_norm": 0.007839059457182884, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 134999282.0, "reward": 0.622314453125, "reward_std": 0.1633392572402954, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.60546875, "rewards/symbolic_reward_accuracy/std": 0.4892277717590332, "rewards/symbolic_reward_partial_score/mean": 0.8634440302848816, "rewards/symbolic_reward_partial_score/std": 0.20192034542560577, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07792067527771, "sampling/importance_sampling_ratio/min": 6.452879915741505e-06, "sampling/sampling_logp_difference/max": 11.950984001159668, "sampling/sampling_logp_difference/mean": 0.14813551306724548, "step": 397 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.31489740312099457, "epoch": 1.0473684210526315, "grad_norm": 0.009207435883581638, "learning_rate": 1e-06, "loss": 0.0004, "step": 398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.31071890890598297, "epoch": 1.05, "grad_norm": 0.006323720328509808, "learning_rate": 1e-06, "loss": 0.0004, "step": 399 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.31255054473876953, "epoch": 1.0526315789473684, "grad_norm": 0.015162119641900063, "learning_rate": 1e-06, "loss": -0.0003, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 425.5390625, "completions/mean_terminated_length": 425.5390625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "entropy": 0.3116391748189926, "epoch": 1.055263157894737, "frac_reward_zero_std": 0.25, "grad_norm": 0.011498549953103065, "learning_rate": 1e-06, "loss": 0.0045, "num_tokens": 135611462.0, "reward": 0.6023924350738525, "reward_std": 0.1811559945344925, "rewards/progression_diversity/mean": -1.2093556506442837e-05, "rewards/progression_diversity/std": 0.00027364594279788435, "rewards/symbolic_reward_accuracy/mean": 0.578125, "rewards/symbolic_reward_accuracy/std": 0.49434176087379456, "rewards/symbolic_reward_partial_score/mean": 0.8517252802848816, "rewards/symbolic_reward_partial_score/std": 0.20265617966651917, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078771710395813, "sampling/importance_sampling_ratio/min": 8.55166106816796e-08, "sampling/sampling_logp_difference/max": 16.274555206298828, "sampling/sampling_logp_difference/mean": 0.14964525401592255, "step": 401 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.31484755873680115, "epoch": 1.0578947368421052, "grad_norm": 0.007245294749736786, "learning_rate": 1e-06, "loss": -0.0002, "step": 402 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.31309810280799866, "epoch": 1.0605263157894738, "grad_norm": 0.005314267706125975, "learning_rate": 1e-06, "loss": -0.0028, "step": 403 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.3142850995063782, "epoch": 1.063157894736842, "grad_norm": 0.012064156122505665, "learning_rate": 1e-06, "loss": 0.0003, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1077.0, "completions/max_terminated_length": 1077.0, "completions/mean_length": 444.736328125, "completions/mean_terminated_length": 444.736328125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.31040342152118683, "epoch": 1.0657894736842106, "frac_reward_zero_std": 0.25, "grad_norm": 0.014743371866643429, "learning_rate": 1e-06, "loss": 0.0023, "num_tokens": 136253823.0, "reward": 0.571484386920929, "reward_std": 0.1715823858976364, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.537109375, "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, "rewards/symbolic_reward_partial_score/mean": 0.8307291865348816, "rewards/symbolic_reward_partial_score/std": 0.21525520086288452, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07864248752594, "sampling/importance_sampling_ratio/min": 2.170778117260852e-07, "sampling/sampling_logp_difference/max": 15.343009948730469, "sampling/sampling_logp_difference/mean": 0.1498403698205948, "step": 405 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3129070848226547, "epoch": 1.068421052631579, "grad_norm": 0.008034562692046165, "learning_rate": 1e-06, "loss": -0.0028, "step": 406 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3151528090238571, "epoch": 1.0710526315789473, "grad_norm": 0.007343663834035397, "learning_rate": 1e-06, "loss": -0.0021, "step": 407 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31475599110126495, "epoch": 1.0736842105263158, "grad_norm": 0.009991384111344814, "learning_rate": 1e-06, "loss": 0.0044, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 439.943359375, "completions/mean_terminated_length": 439.943359375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.31397488713264465, "epoch": 1.0763157894736841, "frac_reward_zero_std": 0.375, "grad_norm": 0.009094743058085442, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 136884386.0, "reward": 0.552441418170929, "reward_std": 0.1292879283428192, "rewards/progression_diversity/mean": -7.611899377479858e-07, "rewards/progression_diversity/std": 1.722376146062743e-05, "rewards/symbolic_reward_accuracy/mean": 0.5, "rewards/symbolic_reward_accuracy/std": 0.5004889965057373, "rewards/symbolic_reward_partial_score/mean": 0.8414713144302368, "rewards/symbolic_reward_partial_score/std": 0.19328801333904266, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0790287256240845, "sampling/importance_sampling_ratio/min": 0.0020058066584169865, "sampling/sampling_logp_difference/max": 6.211709022521973, "sampling/sampling_logp_difference/mean": 0.14997223019599915, "step": 409 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3111269772052765, "epoch": 1.0789473684210527, "grad_norm": 0.010494858026504517, "learning_rate": 1e-06, "loss": -0.0016, "step": 410 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.31543155014514923, "epoch": 1.081578947368421, "grad_norm": 0.006056911777704954, "learning_rate": 1e-06, "loss": -0.0008, "step": 411 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3128467947244644, "epoch": 1.0842105263157895, "grad_norm": 0.0047110323794186115, "learning_rate": 1e-06, "loss": 0.0043, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 458.69921875, "completions/mean_terminated_length": 427.53424072265625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.3135870397090912, "epoch": 1.0868421052631578, "frac_reward_zero_std": 0.1875, "grad_norm": 0.008776719681918621, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 137529544.0, "reward": 0.5720604062080383, "reward_std": 0.18176168203353882, "rewards/progression_diversity/mean": -0.0009923388715833426, "rewards/progression_diversity/std": 0.022347548976540565, "rewards/symbolic_reward_accuracy/mean": 0.53515625, "rewards/symbolic_reward_accuracy/std": 0.49925029277801514, "rewards/symbolic_reward_partial_score/mean": 0.8372396230697632, "rewards/symbolic_reward_partial_score/std": 0.20521557331085205, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0783798694610596, "sampling/importance_sampling_ratio/min": 3.016177743120352e-06, "sampling/sampling_logp_difference/max": 12.711520195007324, "sampling/sampling_logp_difference/mean": 0.14703714847564697, "step": 413 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.315027192234993, "epoch": 1.0894736842105264, "grad_norm": 0.007484138943254948, "learning_rate": 1e-06, "loss": 0.0056, "step": 414 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.31333404779434204, "epoch": 1.0921052631578947, "grad_norm": 0.006227640900760889, "learning_rate": 1e-06, "loss": -0.004, "step": 415 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.30919110774993896, "epoch": 1.0947368421052632, "grad_norm": 0.04769471660256386, "learning_rate": 1e-06, "loss": 0.0281, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 436.810546875, "completions/mean_terminated_length": 436.810546875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.31040582060813904, "epoch": 1.0973684210526315, "frac_reward_zero_std": 0.34375, "grad_norm": 0.011968264356255531, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 138151143.0, "reward": 0.645214855670929, "reward_std": 0.16506317257881165, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.6328125, "rewards/symbolic_reward_accuracy/std": 0.48250964283943176, "rewards/symbolic_reward_partial_score/mean": 0.8850911855697632, "rewards/symbolic_reward_partial_score/std": 0.1865391731262207, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0785300731658936, "sampling/importance_sampling_ratio/min": 0.0006420343997888267, "sampling/sampling_logp_difference/max": 7.3508687019348145, "sampling/sampling_logp_difference/mean": 0.1492883265018463, "step": 417 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.30660849809646606, "epoch": 1.1, "grad_norm": 0.00733218202367425, "learning_rate": 1e-06, "loss": -0.0016, "step": 418 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3156313896179199, "epoch": 1.1026315789473684, "grad_norm": 0.008495202288031578, "learning_rate": 1e-06, "loss": 0.0032, "step": 419 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3160051256418228, "epoch": 1.1052631578947367, "grad_norm": 0.004335207864642143, "learning_rate": 1e-06, "loss": -0.0001, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 418.0078125, "completions/mean_terminated_length": 418.0078125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.3174561411142349, "epoch": 1.1078947368421053, "frac_reward_zero_std": 0.375, "grad_norm": 0.009130661375820637, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 138748619.0, "reward": 0.5871574282646179, "reward_std": 0.10197652876377106, "rewards/progression_diversity/mean": -8.063457789830863e-05, "rewards/progression_diversity/std": 0.0018245523096993566, "rewards/symbolic_reward_accuracy/mean": 0.552734375, "rewards/symbolic_reward_accuracy/std": 0.4976975917816162, "rewards/symbolic_reward_partial_score/mean": 0.8517252802848816, "rewards/symbolic_reward_partial_score/std": 0.1851339489221573, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0805712938308716, "sampling/importance_sampling_ratio/min": 1.1332810384567793e-10, "sampling/sampling_logp_difference/max": 22.900733947753906, "sampling/sampling_logp_difference/mean": 0.15244132280349731, "step": 421 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31834374368190765, "epoch": 1.1105263157894736, "grad_norm": 0.00669381208717823, "learning_rate": 1e-06, "loss": -0.0022, "step": 422 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31199371814727783, "epoch": 1.1131578947368421, "grad_norm": 0.006369912531226873, "learning_rate": 1e-06, "loss": 0.0001, "step": 423 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31869344413280487, "epoch": 1.1157894736842104, "grad_norm": 0.009942461736500263, "learning_rate": 1e-06, "loss": 0.0021, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 424.611328125, "completions/mean_terminated_length": 424.611328125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.3098357766866684, "epoch": 1.118421052631579, "frac_reward_zero_std": 0.21875, "grad_norm": 0.01318982895463705, "learning_rate": 1e-06, "loss": -0.0019, "num_tokens": 139371172.0, "reward": 0.5094720125198364, "reward_std": 0.14669474959373474, "rewards/progression_diversity/mean": -6.337450759019703e-05, "rewards/progression_diversity/std": 0.0014340013731271029, "rewards/symbolic_reward_accuracy/mean": 0.4375, "rewards/symbolic_reward_accuracy/std": 0.49656352400779724, "rewards/symbolic_reward_partial_score/mean": 0.8238931894302368, "rewards/symbolic_reward_partial_score/std": 0.17723464965820312, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0803987979888916, "sampling/importance_sampling_ratio/min": 0.00013676800881512463, "sampling/sampling_logp_difference/max": 8.897224426269531, "sampling/sampling_logp_difference/mean": 0.15104670822620392, "step": 425 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.32109367847442627, "epoch": 1.1210526315789473, "grad_norm": 0.010008157230913639, "learning_rate": 1e-06, "loss": -0.0015, "step": 426 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.3111288249492645, "epoch": 1.1236842105263158, "grad_norm": 0.009902669116854668, "learning_rate": 1e-06, "loss": 0.0032, "step": 427 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3828125, "entropy": 0.3151933252811432, "epoch": 1.1263157894736842, "grad_norm": 0.006997853983193636, "learning_rate": 1e-06, "loss": 0.0004, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 411.9609375, "completions/mean_terminated_length": 411.9609375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.31707698106765747, "epoch": 1.1289473684210527, "frac_reward_zero_std": 0.4375, "grad_norm": 0.009430482983589172, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 139965712.0, "reward": 0.5824218988418579, "reward_std": 0.09653446823358536, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5390625, "rewards/symbolic_reward_accuracy/std": 0.4989593029022217, "rewards/symbolic_reward_partial_score/mean": 0.86328125, "rewards/symbolic_reward_partial_score/std": 0.1686072200536728, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0779767036437988, "sampling/importance_sampling_ratio/min": 0.0005325896781869233, "sampling/sampling_logp_difference/max": 7.537759304046631, "sampling/sampling_logp_difference/mean": 0.1509750634431839, "step": 429 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.31451140344142914, "epoch": 1.131578947368421, "grad_norm": 0.006913443561643362, "learning_rate": 1e-06, "loss": 0.0036, "step": 430 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3125956207513809, "epoch": 1.1342105263157896, "grad_norm": 0.005642498843371868, "learning_rate": 1e-06, "loss": -0.0028, "step": 431 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.30931124091148376, "epoch": 1.1368421052631579, "grad_norm": 0.0037342640571296215, "learning_rate": 1e-06, "loss": -0.0011, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 416.34375, "completions/mean_terminated_length": 416.34375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.31373219192028046, "epoch": 1.1394736842105262, "frac_reward_zero_std": 0.3125, "grad_norm": 0.009242606349289417, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 140564160.0, "reward": 0.6995117664337158, "reward_std": 0.17368769645690918, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.712890625, "rewards/symbolic_reward_accuracy/std": 0.45285552740097046, "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, "rewards/symbolic_reward_partial_score/std": 0.18118035793304443, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078127384185791, "sampling/importance_sampling_ratio/min": 0.00010586700227577239, "sampling/sampling_logp_difference/max": 9.153326988220215, "sampling/sampling_logp_difference/mean": 0.1500268280506134, "step": 433 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3079152852296829, "epoch": 1.1421052631578947, "grad_norm": 0.010234912857413292, "learning_rate": 1e-06, "loss": 0.0002, "step": 434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.31362414360046387, "epoch": 1.1447368421052633, "grad_norm": 0.009198206476867199, "learning_rate": 1e-06, "loss": 0.0004, "step": 435 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.313403844833374, "epoch": 1.1473684210526316, "grad_norm": 0.007888988591730595, "learning_rate": 1e-06, "loss": -0.0009, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 455.685546875, "completions/mean_terminated_length": 424.5146789550781, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.30758020281791687, "epoch": 1.15, "frac_reward_zero_std": 0.34375, "grad_norm": 0.012641419656574726, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 141187103.0, "reward": 0.5866211652755737, "reward_std": 0.12764082849025726, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.544921875, "rewards/symbolic_reward_accuracy/std": 0.4984649419784546, "rewards/symbolic_reward_partial_score/mean": 0.8662109375, "rewards/symbolic_reward_partial_score/std": 0.1738623082637787, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0769283771514893, "sampling/importance_sampling_ratio/min": 6.12766743870452e-05, "sampling/sampling_logp_difference/max": 9.700111389160156, "sampling/sampling_logp_difference/mean": 0.14595070481300354, "step": 437 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.31858476996421814, "epoch": 1.1526315789473685, "grad_norm": 0.006982941180467606, "learning_rate": 1e-06, "loss": -0.0004, "step": 438 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.30900435149669647, "epoch": 1.1552631578947368, "grad_norm": 0.005663620308041573, "learning_rate": 1e-06, "loss": -0.0011, "step": 439 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3072082996368408, "epoch": 1.1578947368421053, "grad_norm": 0.006159190554171801, "learning_rate": 1e-06, "loss": 0.0078, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 411.267578125, "completions/mean_terminated_length": 411.267578125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3167288303375244, "epoch": 1.1605263157894736, "frac_reward_zero_std": 0.3125, "grad_norm": 0.008326681330800056, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 141824520.0, "reward": 0.5193359851837158, "reward_std": 0.1523609459400177, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.44921875, "rewards/symbolic_reward_accuracy/std": 0.497901052236557, "rewards/symbolic_reward_partial_score/mean": 0.8326823115348816, "rewards/symbolic_reward_partial_score/std": 0.18424758315086365, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0789525508880615, "sampling/importance_sampling_ratio/min": 0.0006212457665242255, "sampling/sampling_logp_difference/max": 7.38378381729126, "sampling/sampling_logp_difference/mean": 0.15032032132148743, "step": 441 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3107949197292328, "epoch": 1.1631578947368422, "grad_norm": 0.009470357559621334, "learning_rate": 1e-06, "loss": -0.0001, "step": 442 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31048038601875305, "epoch": 1.1657894736842105, "grad_norm": 0.008164659142494202, "learning_rate": 1e-06, "loss": -0.0014, "step": 443 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.31193624436855316, "epoch": 1.168421052631579, "grad_norm": 0.006771499291062355, "learning_rate": 1e-06, "loss": 0.0025, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 454.470703125, "completions/mean_terminated_length": 423.2974548339844, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.3074225187301636, "epoch": 1.1710526315789473, "frac_reward_zero_std": 0.375, "grad_norm": 0.012199649587273598, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 142451673.0, "reward": 0.6461914777755737, "reward_std": 0.17145656049251556, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.638671875, "rewards/symbolic_reward_accuracy/std": 0.48085519671440125, "rewards/symbolic_reward_partial_score/mean": 0.8772786259651184, "rewards/symbolic_reward_partial_score/std": 0.19054405391216278, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0755971670150757, "sampling/importance_sampling_ratio/min": 0.0009448478813283145, "sampling/sampling_logp_difference/max": 6.964486598968506, "sampling/sampling_logp_difference/mean": 0.14415866136550903, "step": 445 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3090481609106064, "epoch": 1.1736842105263159, "grad_norm": 0.010675223544239998, "learning_rate": 1e-06, "loss": 0.0009, "step": 446 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2992829978466034, "epoch": 1.1763157894736842, "grad_norm": 0.009602191857993603, "learning_rate": 1e-06, "loss": 0.0072, "step": 447 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3019806146621704, "epoch": 1.1789473684210527, "grad_norm": 0.0074586388655006886, "learning_rate": 1e-06, "loss": -0.0011, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1698.0, "completions/max_terminated_length": 1698.0, "completions/mean_length": 416.384765625, "completions/mean_terminated_length": 416.384765625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3051687926054001, "epoch": 1.181578947368421, "frac_reward_zero_std": 0.28125, "grad_norm": 0.009870451875030994, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 143072158.0, "reward": 0.6162108182907104, "reward_std": 0.16109126806259155, "rewards/progression_diversity/mean": -1.2670810974668711e-05, "rewards/progression_diversity/std": 0.0002688794629648328, "rewards/symbolic_reward_accuracy/mean": 0.59375, "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, "rewards/symbolic_reward_partial_score/mean": 0.8665364384651184, "rewards/symbolic_reward_partial_score/std": 0.20193934440612793, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0762814283370972, "sampling/importance_sampling_ratio/min": 0.00018196065502706915, "sampling/sampling_logp_difference/max": 8.611720085144043, "sampling/sampling_logp_difference/mean": 0.14643414318561554, "step": 449 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.31105637550354004, "epoch": 1.1842105263157894, "grad_norm": 0.007723489310592413, "learning_rate": 1e-06, "loss": 0.0014, "step": 450 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30340468883514404, "epoch": 1.186842105263158, "grad_norm": 0.007095393259078264, "learning_rate": 1e-06, "loss": 0.0006, "step": 451 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30283382534980774, "epoch": 1.1894736842105262, "grad_norm": 0.003884904785081744, "learning_rate": 1e-06, "loss": -0.0001, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 421.1328125, "completions/mean_terminated_length": 421.1328125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.3055991977453232, "epoch": 1.1921052631578948, "frac_reward_zero_std": 0.21875, "grad_norm": 0.00943301897495985, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 143696098.0, "reward": 0.5283684730529785, "reward_std": 0.1500403881072998, "rewards/progression_diversity/mean": -6.903627945575863e-05, "rewards/progression_diversity/std": 0.001343548996374011, "rewards/symbolic_reward_accuracy/mean": 0.462890625, "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, "rewards/symbolic_reward_partial_score/mean": 0.83544921875, "rewards/symbolic_reward_partial_score/std": 0.181224524974823, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0770254135131836, "sampling/importance_sampling_ratio/min": 0.0003639268397819251, "sampling/sampling_logp_difference/max": 7.918557643890381, "sampling/sampling_logp_difference/mean": 0.14773014187812805, "step": 453 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.359375, "entropy": 0.30224618315696716, "epoch": 1.194736842105263, "grad_norm": 0.010618505999445915, "learning_rate": 1e-06, "loss": 0.0026, "step": 454 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.31181102991104126, "epoch": 1.1973684210526316, "grad_norm": 0.008554063737392426, "learning_rate": 1e-06, "loss": -0.0029, "step": 455 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.25, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.40625, "entropy": 0.31251056492328644, "epoch": 1.2, "grad_norm": 0.00930565595626831, "learning_rate": 1e-06, "loss": 0.0019, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 411.087890625, "completions/mean_terminated_length": 411.087890625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.3041108399629593, "epoch": 1.2026315789473685, "frac_reward_zero_std": 0.3125, "grad_norm": 0.010307530872523785, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 144323215.0, "reward": 0.6019531488418579, "reward_std": 0.15361054241657257, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.5703125, "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, "rewards/symbolic_reward_partial_score/mean": 0.8658853769302368, "rewards/symbolic_reward_partial_score/std": 0.1798396110534668, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0760828256607056, "sampling/importance_sampling_ratio/min": 0.00027435572701506317, "sampling/sampling_logp_difference/max": 8.201085090637207, "sampling/sampling_logp_difference/mean": 0.14550653100013733, "step": 457 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.30855709314346313, "epoch": 1.2052631578947368, "grad_norm": 0.006558986846357584, "learning_rate": 1e-06, "loss": 0.0005, "step": 458 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.30384236574172974, "epoch": 1.2078947368421054, "grad_norm": 0.005990568548440933, "learning_rate": 1e-06, "loss": 0.0018, "step": 459 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30504827201366425, "epoch": 1.2105263157894737, "grad_norm": 0.011308073066174984, "learning_rate": 1e-06, "loss": -0.0021, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 413.00390625, "completions/mean_terminated_length": 413.00390625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3013734370470047, "epoch": 1.2131578947368422, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008770092390477657, "learning_rate": 1e-06, "loss": 0.0024, "num_tokens": 144952657.0, "reward": 0.5407714247703552, "reward_std": 0.12260206788778305, "rewards/progression_diversity/mean": -6.970949470996857e-06, "rewards/progression_diversity/std": 0.00015773458289913833, "rewards/symbolic_reward_accuracy/mean": 0.4921875, "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, "rewards/symbolic_reward_partial_score/mean": 0.8181966543197632, "rewards/symbolic_reward_partial_score/std": 0.21273426711559296, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076730728149414, "sampling/importance_sampling_ratio/min": 7.727334013907239e-05, "sampling/sampling_logp_difference/max": 9.468161582946777, "sampling/sampling_logp_difference/mean": 0.14597457647323608, "step": 461 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.30145081877708435, "epoch": 1.2157894736842105, "grad_norm": 0.005171489901840687, "learning_rate": 1e-06, "loss": -0.0021, "step": 462 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3117906451225281, "epoch": 1.2184210526315788, "grad_norm": 0.00654226541519165, "learning_rate": 1e-06, "loss": 0.0038, "step": 463 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3086640387773514, "epoch": 1.2210526315789474, "grad_norm": 0.007939969189465046, "learning_rate": 1e-06, "loss": -0.0043, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 445.212890625, "completions/mean_terminated_length": 414.0215148925781, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.31280216574668884, "epoch": 1.2236842105263157, "frac_reward_zero_std": 0.25, "grad_norm": 0.009629838168621063, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 145584574.0, "reward": 0.6421382427215576, "reward_std": 0.18600882589817047, "rewards/progression_diversity/mean": -3.9178747101686895e-05, "rewards/progression_diversity/std": 0.000886513851583004, "rewards/symbolic_reward_accuracy/mean": 0.626953125, "rewards/symbolic_reward_accuracy/std": 0.48408737778663635, "rewards/symbolic_reward_partial_score/mean": 0.88720703125, "rewards/symbolic_reward_partial_score/std": 0.17482532560825348, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0746833086013794, "sampling/importance_sampling_ratio/min": 0.0003272001340519637, "sampling/sampling_logp_difference/max": 8.024938583374023, "sampling/sampling_logp_difference/mean": 0.13991132378578186, "step": 465 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.2979233115911484, "epoch": 1.2263157894736842, "grad_norm": 0.009606428444385529, "learning_rate": 1e-06, "loss": 0.0284, "step": 466 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.3038392812013626, "epoch": 1.2289473684210526, "grad_norm": 0.006687372922897339, "learning_rate": 1e-06, "loss": -0.0004, "step": 467 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.30388760566711426, "epoch": 1.231578947368421, "grad_norm": 0.010134851559996605, "learning_rate": 1e-06, "loss": 0.001, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 409.5859375, "completions/mean_terminated_length": 409.5859375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3190210312604904, "epoch": 1.2342105263157894, "frac_reward_zero_std": 0.125, "grad_norm": 0.010196955874562263, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 146227466.0, "reward": 0.5007807016372681, "reward_std": 0.19226862490177155, "rewards/progression_diversity/mean": -5.4633375839330256e-05, "rewards/progression_diversity/std": 0.0007276704418472946, "rewards/symbolic_reward_accuracy/mean": 0.4296875, "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, "rewards/symbolic_reward_partial_score/mean": 0.8098958730697632, "rewards/symbolic_reward_partial_score/std": 0.19706134498119354, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778709650039673, "sampling/importance_sampling_ratio/min": 0.001399537082761526, "sampling/sampling_logp_difference/max": 6.571613788604736, "sampling/sampling_logp_difference/mean": 0.14782477915287018, "step": 469 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.2109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3139819949865341, "epoch": 1.236842105263158, "grad_norm": 0.007627170532941818, "learning_rate": 1e-06, "loss": 0.0021, "step": 470 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1796875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.30873872339725494, "epoch": 1.2394736842105263, "grad_norm": 0.006808551959693432, "learning_rate": 1e-06, "loss": 0.0016, "step": 471 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.4453125, "entropy": 0.3088305741548538, "epoch": 1.2421052631578948, "grad_norm": 0.011055199429392815, "learning_rate": 1e-06, "loss": 0.0007, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 410.4921875, "completions/mean_terminated_length": 410.4921875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.31317228078842163, "epoch": 1.2447368421052631, "frac_reward_zero_std": 0.375, "grad_norm": 0.009969279170036316, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 146844454.0, "reward": 0.566699206829071, "reward_std": 0.13038089871406555, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.513671875, "rewards/symbolic_reward_accuracy/std": 0.5003018379211426, "rewards/symbolic_reward_partial_score/mean": 0.8616536259651184, "rewards/symbolic_reward_partial_score/std": 0.16637560725212097, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773735046386719, "sampling/importance_sampling_ratio/min": 2.6251354938722216e-06, "sampling/sampling_logp_difference/max": 12.850378036499023, "sampling/sampling_logp_difference/mean": 0.1465354859828949, "step": 473 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.28125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3984375, "entropy": 0.31050053238868713, "epoch": 1.2473684210526317, "grad_norm": 0.005133951548486948, "learning_rate": 1e-06, "loss": -0.0001, "step": 474 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3095279932022095, "epoch": 1.25, "grad_norm": 0.009616038762032986, "learning_rate": 1e-06, "loss": 0.0009, "step": 475 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31143562495708466, "epoch": 1.2526315789473683, "grad_norm": 0.008402734994888306, "learning_rate": 1e-06, "loss": 0.0029, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 410.126953125, "completions/mean_terminated_length": 410.126953125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.31026244163513184, "epoch": 1.2552631578947369, "frac_reward_zero_std": 0.4375, "grad_norm": 0.00997492577880621, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 147446215.0, "reward": 0.6174317002296448, "reward_std": 0.0901901125907898, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.59765625, "rewards/symbolic_reward_accuracy/std": 0.4908501207828522, "rewards/symbolic_reward_partial_score/mean": 0.8634440302848816, "rewards/symbolic_reward_partial_score/std": 0.19049188494682312, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0762457847595215, "sampling/importance_sampling_ratio/min": 2.626519199111499e-05, "sampling/sampling_logp_difference/max": 10.547266006469727, "sampling/sampling_logp_difference/mean": 0.14433935284614563, "step": 477 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.30593959987163544, "epoch": 1.2578947368421054, "grad_norm": 0.007015667390078306, "learning_rate": 1e-06, "loss": 0.0002, "step": 478 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.30408360064029694, "epoch": 1.2605263157894737, "grad_norm": 0.005526754539459944, "learning_rate": 1e-06, "loss": 0.0006, "step": 479 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3031175583600998, "epoch": 1.263157894736842, "grad_norm": 0.006289470940828323, "learning_rate": 1e-06, "loss": -0.0017, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 407.314453125, "completions/mean_terminated_length": 407.314453125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3031823933124542, "epoch": 1.2657894736842106, "frac_reward_zero_std": 0.46875, "grad_norm": 0.011879026889801025, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 148058728.0, "reward": 0.547900378704071, "reward_std": 0.1337689757347107, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.490234375, "rewards/symbolic_reward_accuracy/std": 0.5003935098648071, "rewards/symbolic_reward_partial_score/mean": 0.8458659052848816, "rewards/symbolic_reward_partial_score/std": 0.1819261908531189, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0763696432113647, "sampling/importance_sampling_ratio/min": 0.0010056419996544719, "sampling/sampling_logp_difference/max": 6.902129173278809, "sampling/sampling_logp_difference/mean": 0.1454847753047943, "step": 481 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31013526022434235, "epoch": 1.268421052631579, "grad_norm": 0.005811288487166166, "learning_rate": 1e-06, "loss": 0.0012, "step": 482 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.30912819504737854, "epoch": 1.2710526315789474, "grad_norm": 0.006225419230759144, "learning_rate": 1e-06, "loss": 0.0035, "step": 483 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.30452604591846466, "epoch": 1.2736842105263158, "grad_norm": 0.007433480583131313, "learning_rate": 1e-06, "loss": -0.0034, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 418.41796875, "completions/mean_terminated_length": 418.41796875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.30341316759586334, "epoch": 1.2763157894736843, "frac_reward_zero_std": 0.4375, "grad_norm": 0.011153457686305046, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 148655230.0, "reward": 0.686572253704071, "reward_std": 0.11991460621356964, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.705078125, "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, "rewards/symbolic_reward_partial_score/mean": 0.87841796875, "rewards/symbolic_reward_partial_score/std": 0.21000272035598755, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075927972793579, "sampling/importance_sampling_ratio/min": 0.0007375699933618307, "sampling/sampling_logp_difference/max": 7.212149620056152, "sampling/sampling_logp_difference/mean": 0.14460578560829163, "step": 485 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2982192039489746, "epoch": 1.2789473684210526, "grad_norm": 0.005498748738318682, "learning_rate": 1e-06, "loss": -0.0034, "step": 486 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.30715058743953705, "epoch": 1.2815789473684212, "grad_norm": 0.00860719196498394, "learning_rate": 1e-06, "loss": 0.0018, "step": 487 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.30552174150943756, "epoch": 1.2842105263157895, "grad_norm": 0.007835413329303265, "learning_rate": 1e-06, "loss": 0.0026, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 403.826171875, "completions/mean_terminated_length": 403.826171875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3113265782594681, "epoch": 1.2868421052631578, "frac_reward_zero_std": 0.40625, "grad_norm": 0.010389966890215874, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 149262469.0, "reward": 0.5675293207168579, "reward_std": 0.10506439208984375, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.515625, "rewards/symbolic_reward_accuracy/std": 0.5002445578575134, "rewards/symbolic_reward_partial_score/mean": 0.8605142831802368, "rewards/symbolic_reward_partial_score/std": 0.16847233474254608, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07625412940979, "sampling/importance_sampling_ratio/min": 0.002913564909249544, "sampling/sampling_logp_difference/max": 5.838377952575684, "sampling/sampling_logp_difference/mean": 0.14658384025096893, "step": 489 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.30906206369400024, "epoch": 1.2894736842105263, "grad_norm": 0.006328464951366186, "learning_rate": 1e-06, "loss": -0.0022, "step": 490 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.30755680799484253, "epoch": 1.2921052631578949, "grad_norm": 0.009180142544209957, "learning_rate": 1e-06, "loss": 0.0017, "step": 491 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3033202886581421, "epoch": 1.2947368421052632, "grad_norm": 0.0056364513002336025, "learning_rate": 1e-06, "loss": -0.0002, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 405.30078125, "completions/mean_terminated_length": 405.30078125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.30497485399246216, "epoch": 1.2973684210526315, "frac_reward_zero_std": 0.375, "grad_norm": 0.008342030458152294, "learning_rate": 1e-06, "loss": -0.0025, "num_tokens": 149876959.0, "reward": 0.6874990463256836, "reward_std": 0.1433926224708557, "rewards/progression_diversity/mean": -0.00010337447747588158, "rewards/progression_diversity/std": 0.0016614391934126616, "rewards/symbolic_reward_accuracy/mean": 0.69140625, "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, "rewards/symbolic_reward_partial_score/mean": 0.9088541865348816, "rewards/symbolic_reward_partial_score/std": 0.14890244603157043, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0752720832824707, "sampling/importance_sampling_ratio/min": 0.0008119558915495872, "sampling/sampling_logp_difference/max": 7.116064548492432, "sampling/sampling_logp_difference/mean": 0.14508485794067383, "step": 493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3033030182123184, "epoch": 1.3, "grad_norm": 0.004780566319823265, "learning_rate": 1e-06, "loss": -0.0005, "step": 494 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30343642830848694, "epoch": 1.3026315789473684, "grad_norm": 0.006665179040282965, "learning_rate": 1e-06, "loss": 0.0025, "step": 495 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3080981373786926, "epoch": 1.305263157894737, "grad_norm": 0.00797088909894228, "learning_rate": 1e-06, "loss": 0.0014, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 407.87890625, "completions/mean_terminated_length": 407.87890625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.30483171343803406, "epoch": 1.3078947368421052, "frac_reward_zero_std": 0.28125, "grad_norm": 0.010050629265606403, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 150488097.0, "reward": 0.545116662979126, "reward_std": 0.13066868484020233, "rewards/progression_diversity/mean": -5.45380862604361e-05, "rewards/progression_diversity/std": 0.0008203862817026675, "rewards/symbolic_reward_accuracy/mean": 0.494140625, "rewards/symbolic_reward_accuracy/std": 0.5004546642303467, "rewards/symbolic_reward_partial_score/mean": 0.8287760615348816, "rewards/symbolic_reward_partial_score/std": 0.19000308215618134, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0742123126983643, "sampling/importance_sampling_ratio/min": 9.160504851024598e-05, "sampling/sampling_logp_difference/max": 9.29802417755127, "sampling/sampling_logp_difference/mean": 0.14309212565422058, "step": 497 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.296875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.29966841638088226, "epoch": 1.3105263157894738, "grad_norm": 0.004824103321880102, "learning_rate": 1e-06, "loss": -0.002, "step": 498 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.30102767050266266, "epoch": 1.313157894736842, "grad_norm": 0.007378766778856516, "learning_rate": 1e-06, "loss": 0.0001, "step": 499 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.30113108456134796, "epoch": 1.3157894736842106, "grad_norm": 0.009306096471846104, "learning_rate": 1e-06, "loss": -0.0011, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 399.66796875, "completions/mean_terminated_length": 399.66796875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.30337396264076233, "epoch": 1.318421052631579, "frac_reward_zero_std": 0.28125, "grad_norm": 0.011074943467974663, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 151096535.0, "reward": 0.6620602011680603, "reward_std": 0.17205195128917694, "rewards/progression_diversity/mean": -3.9536142139695585e-05, "rewards/progression_diversity/std": 0.0008946007583290339, "rewards/symbolic_reward_accuracy/mean": 0.6640625, "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, "rewards/symbolic_reward_partial_score/mean": 0.8787435293197632, "rewards/symbolic_reward_partial_score/std": 0.19655956327915192, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.075458288192749, "sampling/importance_sampling_ratio/min": 3.9075985114322975e-05, "sampling/sampling_logp_difference/max": 10.150002479553223, "sampling/sampling_logp_difference/mean": 0.14344847202301025, "step": 501 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3023105710744858, "epoch": 1.3210526315789473, "grad_norm": 0.006497818976640701, "learning_rate": 1e-06, "loss": -0.0023, "step": 502 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.30219969153404236, "epoch": 1.3236842105263158, "grad_norm": 0.004322134889662266, "learning_rate": 1e-06, "loss": -0.0024, "step": 503 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.3050485998392105, "epoch": 1.3263157894736843, "grad_norm": 0.008752789348363876, "learning_rate": 1e-06, "loss": 0.006, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 402.416015625, "completions/mean_terminated_length": 402.416015625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.30552002787590027, "epoch": 1.3289473684210527, "frac_reward_zero_std": 0.5, "grad_norm": 0.00773452315479517, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 151739244.0, "reward": 0.596386730670929, "reward_std": 0.1162668839097023, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.564453125, "rewards/symbolic_reward_accuracy/std": 0.49631330370903015, "rewards/symbolic_reward_partial_score/mean": 0.8590494394302368, "rewards/symbolic_reward_partial_score/std": 0.1787545084953308, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077009916305542, "sampling/importance_sampling_ratio/min": 4.0180704672820866e-05, "sampling/sampling_logp_difference/max": 10.122123718261719, "sampling/sampling_logp_difference/mean": 0.1449548900127411, "step": 505 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3068659007549286, "epoch": 1.331578947368421, "grad_norm": 0.004809284582734108, "learning_rate": 1e-06, "loss": -0.0015, "step": 506 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.30974075198173523, "epoch": 1.3342105263157895, "grad_norm": 0.008824328891932964, "learning_rate": 1e-06, "loss": 0.0025, "step": 507 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.30546213686466217, "epoch": 1.3368421052631578, "grad_norm": 0.007738140411674976, "learning_rate": 1e-06, "loss": 0.0023, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 403.423828125, "completions/mean_terminated_length": 403.423828125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3024378567934036, "epoch": 1.3394736842105264, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008510293439030647, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 152346597.0, "reward": 0.6561523675918579, "reward_std": 0.13507352769374847, "rewards/progression_diversity/mean": -1.180451363325119e-06, "rewards/progression_diversity/std": 2.6710564270615578e-05, "rewards/symbolic_reward_accuracy/mean": 0.654296875, "rewards/symbolic_reward_accuracy/std": 0.4760620892047882, "rewards/symbolic_reward_partial_score/mean": 0.8785806894302368, "rewards/symbolic_reward_partial_score/std": 0.19002403318881989, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0751402378082275, "sampling/importance_sampling_ratio/min": 4.670919224736281e-06, "sampling/sampling_logp_difference/max": 12.274154663085938, "sampling/sampling_logp_difference/mean": 0.14365223050117493, "step": 509 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3060772866010666, "epoch": 1.3421052631578947, "grad_norm": 0.006542941089719534, "learning_rate": 1e-06, "loss": 0.0011, "step": 510 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30383458733558655, "epoch": 1.3447368421052632, "grad_norm": 0.005349365528672934, "learning_rate": 1e-06, "loss": -0.0003, "step": 511 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.3082137107849121, "epoch": 1.3473684210526315, "grad_norm": 0.009062383323907852, "learning_rate": 1e-06, "loss": 0.0012, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 409.34375, "completions/mean_terminated_length": 409.34375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "entropy": 0.3136248141527176, "epoch": 1.35, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008149274624884129, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 152938453.0, "reward": 0.6833982467651367, "reward_std": 0.1238265112042427, "rewards/progression_diversity/mean": -2.053233765764162e-05, "rewards/progression_diversity/std": 0.0004645937879104167, "rewards/symbolic_reward_accuracy/mean": 0.689453125, "rewards/symbolic_reward_accuracy/std": 0.46317005157470703, "rewards/symbolic_reward_partial_score/mean": 0.8990885019302368, "rewards/symbolic_reward_partial_score/std": 0.18109507858753204, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0766186714172363, "sampling/importance_sampling_ratio/min": 1.2683102795563173e-05, "sampling/sampling_logp_difference/max": 11.275239944458008, "sampling/sampling_logp_difference/mean": 0.14412665367126465, "step": 513 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3056807965040207, "epoch": 1.3526315789473684, "grad_norm": 0.003623334923759103, "learning_rate": 1e-06, "loss": 0.0003, "step": 514 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.30095237493515015, "epoch": 1.3552631578947367, "grad_norm": 0.0060382746160030365, "learning_rate": 1e-06, "loss": 0.0019, "step": 515 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3073163330554962, "epoch": 1.3578947368421053, "grad_norm": 0.008139624260365963, "learning_rate": 1e-06, "loss": -0.0012, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 396.921875, "completions/mean_terminated_length": 396.921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.3063454031944275, "epoch": 1.3605263157894738, "frac_reward_zero_std": 0.34375, "grad_norm": 0.01082697045058012, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 153535981.0, "reward": 0.5885741710662842, "reward_std": 0.13205640017986298, "rewards/progression_diversity/mean": -1.0338511856389232e-05, "rewards/progression_diversity/std": 0.00017001591913867742, "rewards/symbolic_reward_accuracy/mean": 0.556640625, "rewards/symbolic_reward_accuracy/std": 0.49726733565330505, "rewards/symbolic_reward_partial_score/mean": 0.8486328125, "rewards/symbolic_reward_partial_score/std": 0.20038633048534393, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0767133235931396, "sampling/importance_sampling_ratio/min": 0.0021863149013370275, "sampling/sampling_logp_difference/max": 6.125537872314453, "sampling/sampling_logp_difference/mean": 0.1458137333393097, "step": 517 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3054617643356323, "epoch": 1.3631578947368421, "grad_norm": 0.006087815389037132, "learning_rate": 1e-06, "loss": -0.0007, "step": 518 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3359375, "entropy": 0.309252068400383, "epoch": 1.3657894736842104, "grad_norm": 0.0069716330617666245, "learning_rate": 1e-06, "loss": 0.0027, "step": 519 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.31125877797603607, "epoch": 1.368421052631579, "grad_norm": 0.006700367201119661, "learning_rate": 1e-06, "loss": -0.0025, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 394.095703125, "completions/mean_terminated_length": 394.095703125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.30801936984062195, "epoch": 1.3710526315789473, "frac_reward_zero_std": 0.34375, "grad_norm": 0.007649766281247139, "learning_rate": 1e-06, "loss": -0.0021, "num_tokens": 154135390.0, "reward": 0.5628417730331421, "reward_std": 0.1588105857372284, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.517578125, "rewards/symbolic_reward_accuracy/std": 0.5001795887947083, "rewards/symbolic_reward_partial_score/mean": 0.8416340947151184, "rewards/symbolic_reward_partial_score/std": 0.21291983127593994, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0771160125732422, "sampling/importance_sampling_ratio/min": 8.427595275861677e-06, "sampling/sampling_logp_difference/max": 11.683999061584473, "sampling/sampling_logp_difference/mean": 0.14557112753391266, "step": 521 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.30574391782283783, "epoch": 1.3736842105263158, "grad_norm": 0.006504260469228029, "learning_rate": 1e-06, "loss": -0.0017, "step": 522 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.31414419412612915, "epoch": 1.3763157894736842, "grad_norm": 0.004781621042639017, "learning_rate": 1e-06, "loss": 0.0009, "step": 523 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3671875, "entropy": 0.3113695830106735, "epoch": 1.3789473684210527, "grad_norm": 0.00859212689101696, "learning_rate": 1e-06, "loss": 0.0019, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 424.533203125, "completions/mean_terminated_length": 393.3013610839844, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3044796884059906, "epoch": 1.381578947368421, "frac_reward_zero_std": 0.375, "grad_norm": 0.008099560625851154, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 154777071.0, "reward": 0.6270015239715576, "reward_std": 0.1343921273946762, "rewards/progression_diversity/mean": -4.158555020694621e-05, "rewards/progression_diversity/std": 0.0009409735794179142, "rewards/symbolic_reward_accuracy/mean": 0.609375, "rewards/symbolic_reward_accuracy/std": 0.48836761713027954, "rewards/symbolic_reward_partial_score/mean": 0.8719075918197632, "rewards/symbolic_reward_partial_score/std": 0.1844439059495926, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0758286714553833, "sampling/importance_sampling_ratio/min": 5.585740655078553e-05, "sampling/sampling_logp_difference/max": 9.792708396911621, "sampling/sampling_logp_difference/mean": 0.1419658362865448, "step": 525 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3106999695301056, "epoch": 1.3842105263157896, "grad_norm": 0.008930054493248463, "learning_rate": 1e-06, "loss": 0.0297, "step": 526 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3158520311117172, "epoch": 1.3868421052631579, "grad_norm": 0.005683277267962694, "learning_rate": 1e-06, "loss": 0.0005, "step": 527 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3125, "entropy": 0.30958664417266846, "epoch": 1.3894736842105262, "grad_norm": 0.006956308148801327, "learning_rate": 1e-06, "loss": 0.0003, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1008.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 396.740234375, "completions/mean_terminated_length": 396.740234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.3058925271034241, "epoch": 1.3921052631578947, "frac_reward_zero_std": 0.46875, "grad_norm": 0.009263314306735992, "learning_rate": 1e-06, "loss": 0.003, "num_tokens": 155396042.0, "reward": 0.6233398914337158, "reward_std": 0.1278683841228485, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.607421875, "rewards/symbolic_reward_accuracy/std": 0.4888018071651459, "rewards/symbolic_reward_partial_score/mean": 0.8629557490348816, "rewards/symbolic_reward_partial_score/std": 0.19650208950042725, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0787575244903564, "sampling/importance_sampling_ratio/min": 0.003360233036801219, "sampling/sampling_logp_difference/max": 5.69574499130249, "sampling/sampling_logp_difference/mean": 0.14666804671287537, "step": 529 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3156265914440155, "epoch": 1.3947368421052633, "grad_norm": 0.005127849522978067, "learning_rate": 1e-06, "loss": -0.0007, "step": 530 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31428514420986176, "epoch": 1.3973684210526316, "grad_norm": 0.005728641990572214, "learning_rate": 1e-06, "loss": -0.0002, "step": 531 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31455811858177185, "epoch": 1.4, "grad_norm": 0.00949255283921957, "learning_rate": 1e-06, "loss": -0.0013, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 791.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 380.107421875, "completions/mean_terminated_length": 380.107421875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.31324976682662964, "epoch": 1.4026315789473685, "frac_reward_zero_std": 0.4375, "grad_norm": 0.007790201343595982, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 156002465.0, "reward": 0.6195793151855469, "reward_std": 0.1195153146982193, "rewards/progression_diversity/mean": -8.090149640338495e-05, "rewards/progression_diversity/std": 0.001830591820180416, "rewards/symbolic_reward_accuracy/mean": 0.59375, "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, "rewards/symbolic_reward_partial_score/mean": 0.8777669072151184, "rewards/symbolic_reward_partial_score/std": 0.17154613137245178, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078112006187439, "sampling/importance_sampling_ratio/min": 3.216441427866812e-06, "sampling/sampling_logp_difference/max": 12.647234916687012, "sampling/sampling_logp_difference/mean": 0.14644742012023926, "step": 533 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.31220632791519165, "epoch": 1.4052631578947368, "grad_norm": 0.006627189461141825, "learning_rate": 1e-06, "loss": 0.0002, "step": 534 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.313899427652359, "epoch": 1.4078947368421053, "grad_norm": 0.0068327574990689754, "learning_rate": 1e-06, "loss": -0.0014, "step": 535 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3095439076423645, "epoch": 1.4105263157894736, "grad_norm": 0.006293828133493662, "learning_rate": 1e-06, "loss": 0.0006, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 384.59375, "completions/mean_terminated_length": 384.59375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.310128778219223, "epoch": 1.4131578947368422, "frac_reward_zero_std": 0.5625, "grad_norm": 0.008397966623306274, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 156591153.0, "reward": 0.6722656488418579, "reward_std": 0.13169080018997192, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.67578125, "rewards/symbolic_reward_accuracy/std": 0.4685399830341339, "rewards/symbolic_reward_partial_score/mean": 0.8893228769302368, "rewards/symbolic_reward_partial_score/std": 0.18464936316013336, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768916606903076, "sampling/importance_sampling_ratio/min": 0.00040862703463062644, "sampling/sampling_logp_difference/max": 7.802707672119141, "sampling/sampling_logp_difference/mean": 0.1446777880191803, "step": 537 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30385565757751465, "epoch": 1.4157894736842105, "grad_norm": 0.00654578348621726, "learning_rate": 1e-06, "loss": -0.0003, "step": 538 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.30842697620391846, "epoch": 1.418421052631579, "grad_norm": 0.005931271240115166, "learning_rate": 1e-06, "loss": -0.0015, "step": 539 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.307214692234993, "epoch": 1.4210526315789473, "grad_norm": 0.004985845647752285, "learning_rate": 1e-06, "loss": 0.0002, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 385.11328125, "completions/mean_terminated_length": 385.11328125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.31230244040489197, "epoch": 1.4236842105263157, "frac_reward_zero_std": 0.40625, "grad_norm": 0.007098773028701544, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 157183115.0, "reward": 0.67724609375, "reward_std": 0.11101227253675461, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.685546875, "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, "rewards/symbolic_reward_partial_score/mean": 0.8863932490348816, "rewards/symbolic_reward_partial_score/std": 0.19459529221057892, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0779294967651367, "sampling/importance_sampling_ratio/min": 0.0003273308975622058, "sampling/sampling_logp_difference/max": 8.02453899383545, "sampling/sampling_logp_difference/mean": 0.1470160037279129, "step": 541 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31138667464256287, "epoch": 1.4263157894736842, "grad_norm": 0.006334090139716864, "learning_rate": 1e-06, "loss": 0.0023, "step": 542 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3078685700893402, "epoch": 1.4289473684210527, "grad_norm": 0.009027614258229733, "learning_rate": 1e-06, "loss": -0.0005, "step": 543 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.30706819891929626, "epoch": 1.431578947368421, "grad_norm": 0.006638620514422655, "learning_rate": 1e-06, "loss": -0.0019, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 388.3984375, "completions/mean_terminated_length": 388.3984375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3106583505868912, "epoch": 1.4342105263157894, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008243853226304054, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 157760919.0, "reward": 0.735595703125, "reward_std": 0.1595648229122162, "rewards/progression_diversity/mean": -4.20508786191931e-06, "rewards/progression_diversity/std": 9.51502806856297e-05, "rewards/symbolic_reward_accuracy/mean": 0.76171875, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.9285481572151184, "rewards/symbolic_reward_partial_score/std": 0.14488224685192108, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078289270401001, "sampling/importance_sampling_ratio/min": 0.00010619553358992562, "sampling/sampling_logp_difference/max": 9.150228500366211, "sampling/sampling_logp_difference/mean": 0.14683890342712402, "step": 545 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31235410273075104, "epoch": 1.436842105263158, "grad_norm": 0.005676098167896271, "learning_rate": 1e-06, "loss": -0.0025, "step": 546 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.30875393748283386, "epoch": 1.4394736842105262, "grad_norm": 0.00652680266648531, "learning_rate": 1e-06, "loss": 0.0012, "step": 547 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3143164962530136, "epoch": 1.4421052631578948, "grad_norm": 0.006441683974117041, "learning_rate": 1e-06, "loss": 0.0036, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 388.33203125, "completions/mean_terminated_length": 388.33203125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.31009891629219055, "epoch": 1.444736842105263, "frac_reward_zero_std": 0.25, "grad_norm": 0.009880034253001213, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 158389761.0, "reward": 0.5567857027053833, "reward_std": 0.15986129641532898, "rewards/progression_diversity/mean": -0.00014837765775155276, "rewards/progression_diversity/std": 0.0023622072767466307, "rewards/symbolic_reward_accuracy/mean": 0.51171875, "rewards/symbolic_reward_accuracy/std": 0.5003514885902405, "rewards/symbolic_reward_partial_score/mean": 0.83251953125, "rewards/symbolic_reward_partial_score/std": 0.1981428563594818, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077824354171753, "sampling/importance_sampling_ratio/min": 0.0002361713268328458, "sampling/sampling_logp_difference/max": 8.350953102111816, "sampling/sampling_logp_difference/mean": 0.14743688702583313, "step": 549 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.31600329279899597, "epoch": 1.4473684210526316, "grad_norm": 0.007339461240917444, "learning_rate": 1e-06, "loss": -0.0014, "step": 550 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3122548907995224, "epoch": 1.45, "grad_norm": 0.006188119761645794, "learning_rate": 1e-06, "loss": -0.0011, "step": 551 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.453125, "entropy": 0.3066437244415283, "epoch": 1.4526315789473685, "grad_norm": 0.004552071448415518, "learning_rate": 1e-06, "loss": 0.0039, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 384.88671875, "completions/mean_terminated_length": 384.88671875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.3084342032670975, "epoch": 1.4552631578947368, "frac_reward_zero_std": 0.59375, "grad_norm": 0.005656357388943434, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 158984455.0, "reward": 0.7021973133087158, "reward_std": 0.09900879859924316, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.71484375, "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, "rewards/symbolic_reward_partial_score/mean": 0.9109700918197632, "rewards/symbolic_reward_partial_score/std": 0.15867188572883606, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0765399932861328, "sampling/importance_sampling_ratio/min": 6.768944876966998e-05, "sampling/sampling_logp_difference/max": 9.600580215454102, "sampling/sampling_logp_difference/mean": 0.14523030817508698, "step": 553 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.31066471338272095, "epoch": 1.4578947368421051, "grad_norm": 0.0047269840724766254, "learning_rate": 1e-06, "loss": 0.0003, "step": 554 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3113812804222107, "epoch": 1.4605263157894737, "grad_norm": 0.00564876152202487, "learning_rate": 1e-06, "loss": 0.002, "step": 555 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3069649040699005, "epoch": 1.4631578947368422, "grad_norm": 0.008440033532679081, "learning_rate": 1e-06, "loss": -0.0002, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 407.296875, "completions/mean_terminated_length": 376.03131103515625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3084985613822937, "epoch": 1.4657894736842105, "frac_reward_zero_std": 0.4375, "grad_norm": 0.010170203633606434, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 159586111.0, "reward": 0.6351065635681152, "reward_std": 0.11396625638008118, "rewards/progression_diversity/mean": -9.009381756186485e-05, "rewards/progression_diversity/std": 0.0020385903771966696, "rewards/symbolic_reward_accuracy/mean": 0.62109375, "rewards/symbolic_reward_accuracy/std": 0.4855891764163971, "rewards/symbolic_reward_partial_score/mean": 0.87548828125, "rewards/symbolic_reward_partial_score/std": 0.1809736043214798, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0758986473083496, "sampling/importance_sampling_ratio/min": 2.0304414647398517e-05, "sampling/sampling_logp_difference/max": 10.804672241210938, "sampling/sampling_logp_difference/mean": 0.14537295699119568, "step": 557 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3136870563030243, "epoch": 1.4684210526315788, "grad_norm": 0.00794845912605524, "learning_rate": 1e-06, "loss": 0.0017, "step": 558 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3167266547679901, "epoch": 1.4710526315789474, "grad_norm": 0.009949425235390663, "learning_rate": 1e-06, "loss": -0.0024, "step": 559 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3107505738735199, "epoch": 1.4736842105263157, "grad_norm": 0.01014183834195137, "learning_rate": 1e-06, "loss": 0.0145, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 382.29296875, "completions/mean_terminated_length": 382.29296875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.30869798362255096, "epoch": 1.4763157894736842, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008826439268887043, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 160179317.0, "reward": 0.6139646768569946, "reward_std": 0.11059055477380753, "rewards/progression_diversity/mean": -1.9328768757986836e-05, "rewards/progression_diversity/std": 0.00043736008228734136, "rewards/symbolic_reward_accuracy/mean": 0.591796875, "rewards/symbolic_reward_accuracy/std": 0.49198177456855774, "rewards/symbolic_reward_partial_score/mean": 0.8629557490348816, "rewards/symbolic_reward_partial_score/std": 0.19420644640922546, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768595933914185, "sampling/importance_sampling_ratio/min": 0.00032772633130662143, "sampling/sampling_logp_difference/max": 8.023331642150879, "sampling/sampling_logp_difference/mean": 0.14667022228240967, "step": 561 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3062967211008072, "epoch": 1.4789473684210526, "grad_norm": 0.005198640748858452, "learning_rate": 1e-06, "loss": 0.0001, "step": 562 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31053030490875244, "epoch": 1.481578947368421, "grad_norm": 0.006931893527507782, "learning_rate": 1e-06, "loss": 0.0014, "step": 563 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3084117919206619, "epoch": 1.4842105263157894, "grad_norm": 0.007732919882982969, "learning_rate": 1e-06, "loss": -0.0008, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 367.578125, "completions/mean_terminated_length": 367.578125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "entropy": 0.30998261272907257, "epoch": 1.486842105263158, "frac_reward_zero_std": 0.46875, "grad_norm": 0.00754775432869792, "learning_rate": 1e-06, "loss": -0.0024, "num_tokens": 160771645.0, "reward": 0.6725579500198364, "reward_std": 0.1186295673251152, "rewards/progression_diversity/mean": -7.19197269063443e-05, "rewards/progression_diversity/std": 0.0011566577013581991, "rewards/symbolic_reward_accuracy/mean": 0.669921875, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.9026693105697632, "rewards/symbolic_reward_partial_score/std": 0.1593107134103775, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0781853199005127, "sampling/importance_sampling_ratio/min": 5.44397971680155e-06, "sampling/sampling_logp_difference/max": 12.121000289916992, "sampling/sampling_logp_difference/mean": 0.14748519659042358, "step": 565 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31304365396499634, "epoch": 1.4894736842105263, "grad_norm": 0.006320515181869268, "learning_rate": 1e-06, "loss": 0.0005, "step": 566 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3075980395078659, "epoch": 1.4921052631578946, "grad_norm": 0.005773784592747688, "learning_rate": 1e-06, "loss": -0.0003, "step": 567 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.31288425624370575, "epoch": 1.4947368421052631, "grad_norm": 0.004835109226405621, "learning_rate": 1e-06, "loss": 0.0013, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 365.66796875, "completions/mean_terminated_length": 365.66796875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.30872373282909393, "epoch": 1.4973684210526317, "frac_reward_zero_std": 0.375, "grad_norm": 0.007975532673299313, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 161374867.0, "reward": 0.5785626173019409, "reward_std": 0.15455234050750732, "rewards/progression_diversity/mean": -0.00018735449702944607, "rewards/progression_diversity/std": 0.004239348694682121, "rewards/symbolic_reward_accuracy/mean": 0.537109375, "rewards/symbolic_reward_accuracy/std": 0.4991086423397064, "rewards/symbolic_reward_partial_score/mean": 0.8543294668197632, "rewards/symbolic_reward_partial_score/std": 0.18278612196445465, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0777958631515503, "sampling/importance_sampling_ratio/min": 0.00035884103272110224, "sampling/sampling_logp_difference/max": 7.932631015777588, "sampling/sampling_logp_difference/mean": 0.1475108563899994, "step": 569 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30878394842147827, "epoch": 1.5, "grad_norm": 0.006081805098801851, "learning_rate": 1e-06, "loss": 0.0022, "step": 570 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.313975527882576, "epoch": 1.5026315789473683, "grad_norm": 0.004849690943956375, "learning_rate": 1e-06, "loss": -0.0015, "step": 571 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3043152540922165, "epoch": 1.5052631578947369, "grad_norm": 0.006966730579733849, "learning_rate": 1e-06, "loss": -0.0008, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 365.453125, "completions/mean_terminated_length": 365.453125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.3078303337097168, "epoch": 1.5078947368421054, "frac_reward_zero_std": 0.46875, "grad_norm": 0.007786921691149473, "learning_rate": 1e-06, "loss": 0.0021, "num_tokens": 161959771.0, "reward": 0.6698729991912842, "reward_std": 0.09264719486236572, "rewards/progression_diversity/mean": -5.197777682042215e-06, "rewards/progression_diversity/std": 0.00011761228233808652, "rewards/symbolic_reward_accuracy/mean": 0.669921875, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.89306640625, "rewards/symbolic_reward_partial_score/std": 0.17148670554161072, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773301124572754, "sampling/importance_sampling_ratio/min": 6.37840139461332e-06, "sampling/sampling_logp_difference/max": 11.962593078613281, "sampling/sampling_logp_difference/mean": 0.14748983085155487, "step": 573 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.30795395374298096, "epoch": 1.5105263157894737, "grad_norm": 0.00861379038542509, "learning_rate": 1e-06, "loss": -0.0002, "step": 574 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3071673512458801, "epoch": 1.513157894736842, "grad_norm": 0.00907305721193552, "learning_rate": 1e-06, "loss": -0.0006, "step": 575 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3147677630186081, "epoch": 1.5157894736842106, "grad_norm": 0.0050981612876057625, "learning_rate": 1e-06, "loss": -0.0009, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 372.0, "completions/mean_terminated_length": 372.0, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "entropy": 0.30765050649642944, "epoch": 1.518421052631579, "frac_reward_zero_std": 0.40625, "grad_norm": 0.008448776789009571, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 162577083.0, "reward": 0.5401855707168579, "reward_std": 0.13993936777114868, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.48046875, "rewards/symbolic_reward_accuracy/std": 0.5001069903373718, "rewards/symbolic_reward_partial_score/mean": 0.8396810293197632, "rewards/symbolic_reward_partial_score/std": 0.17196524143218994, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0772461891174316, "sampling/importance_sampling_ratio/min": 0.0007980514201335609, "sampling/sampling_logp_difference/max": 7.133337497711182, "sampling/sampling_logp_difference/mean": 0.1471158266067505, "step": 577 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31448253989219666, "epoch": 1.5210526315789474, "grad_norm": 0.008604890666902065, "learning_rate": 1e-06, "loss": -0.0024, "step": 578 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30293673276901245, "epoch": 1.5236842105263158, "grad_norm": 0.005941214971244335, "learning_rate": 1e-06, "loss": 0.0031, "step": 579 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.310068815946579, "epoch": 1.526315789473684, "grad_norm": 0.0063788071274757385, "learning_rate": 1e-06, "loss": -0.0006, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 387.646484375, "completions/mean_terminated_length": 356.34246826171875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.30966073274612427, "epoch": 1.5289473684210526, "frac_reward_zero_std": 0.4375, "grad_norm": 0.0067711323499679565, "learning_rate": 1e-06, "loss": -0.0022, "num_tokens": 163170342.0, "reward": 0.6299285888671875, "reward_std": 0.1152401715517044, "rewards/progression_diversity/mean": -0.00030736689222976565, "rewards/progression_diversity/std": 0.006954919081181288, "rewards/symbolic_reward_accuracy/mean": 0.619140625, "rewards/symbolic_reward_accuracy/std": 0.48607301712036133, "rewards/symbolic_reward_partial_score/mean": 0.8621419668197632, "rewards/symbolic_reward_partial_score/std": 0.20800890028476715, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0749588012695312, "sampling/importance_sampling_ratio/min": 6.100111136220221e-07, "sampling/sampling_logp_difference/max": 14.309788703918457, "sampling/sampling_logp_difference/mean": 0.14406411349773407, "step": 581 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.30469025671482086, "epoch": 1.5315789473684212, "grad_norm": 0.007111302111297846, "learning_rate": 1e-06, "loss": 0.0291, "step": 582 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.30858737230300903, "epoch": 1.5342105263157895, "grad_norm": 0.005157460458576679, "learning_rate": 1e-06, "loss": -0.0006, "step": 583 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.30936598777770996, "epoch": 1.5368421052631578, "grad_norm": 0.00526466453447938, "learning_rate": 1e-06, "loss": 0.0044, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 362.904296875, "completions/mean_terminated_length": 362.904296875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3120175153017044, "epoch": 1.5394736842105263, "frac_reward_zero_std": 0.40625, "grad_norm": 0.008821161463856697, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 163783477.0, "reward": 0.6006832122802734, "reward_std": 0.13123157620429993, "rewards/progression_diversity/mean": -4.234274820191786e-05, "rewards/progression_diversity/std": 0.0009581070044077933, "rewards/symbolic_reward_accuracy/mean": 0.5703125, "rewards/symbolic_reward_accuracy/std": 0.4955156147480011, "rewards/symbolic_reward_partial_score/mean": 0.8616536855697632, "rewards/symbolic_reward_partial_score/std": 0.17949894070625305, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0769450664520264, "sampling/importance_sampling_ratio/min": 0.00024724824470467865, "sampling/sampling_logp_difference/max": 8.3051176071167, "sampling/sampling_logp_difference/mean": 0.14837928116321564, "step": 585 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31175045669078827, "epoch": 1.5421052631578949, "grad_norm": 0.007316979113966227, "learning_rate": 1e-06, "loss": -0.0012, "step": 586 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31634584069252014, "epoch": 1.5447368421052632, "grad_norm": 0.006950597278773785, "learning_rate": 1e-06, "loss": 0.0023, "step": 587 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3515625, "entropy": 0.31104812026023865, "epoch": 1.5473684210526315, "grad_norm": 0.00765888299793005, "learning_rate": 1e-06, "loss": 0.0003, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 355.30078125, "completions/mean_terminated_length": 355.30078125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.30843158066272736, "epoch": 1.55, "frac_reward_zero_std": 0.53125, "grad_norm": 0.009021712467074394, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 164348847.0, "reward": 0.6466308832168579, "reward_std": 0.09964179992675781, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.640625, "rewards/symbolic_reward_accuracy/std": 0.48028653860092163, "rewards/symbolic_reward_partial_score/mean": 0.8741861581802368, "rewards/symbolic_reward_partial_score/std": 0.19597046077251434, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768463611602783, "sampling/importance_sampling_ratio/min": 0.0002325253444723785, "sampling/sampling_logp_difference/max": 8.366511344909668, "sampling/sampling_logp_difference/mean": 0.14811888337135315, "step": 589 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31392180919647217, "epoch": 1.5526315789473686, "grad_norm": 0.005683423485606909, "learning_rate": 1e-06, "loss": -0.0008, "step": 590 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31084367632865906, "epoch": 1.555263157894737, "grad_norm": 0.011430696584284306, "learning_rate": 1e-06, "loss": 0.0014, "step": 591 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3137829303741455, "epoch": 1.5578947368421052, "grad_norm": 0.0035277451388537884, "learning_rate": 1e-06, "loss": -0.001, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 351.935546875, "completions/mean_terminated_length": 351.935546875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.31604452431201935, "epoch": 1.5605263157894735, "frac_reward_zero_std": 0.3125, "grad_norm": 0.01125564705580473, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 164937198.0, "reward": 0.6098132133483887, "reward_std": 0.16237008571624756, "rewards/progression_diversity/mean": -0.00012535469431895763, "rewards/progression_diversity/std": 0.0018754107877612114, "rewards/symbolic_reward_accuracy/mean": 0.583984375, "rewards/symbolic_reward_accuracy/std": 0.493378221988678, "rewards/symbolic_reward_partial_score/mean": 0.86474609375, "rewards/symbolic_reward_partial_score/std": 0.1801557093858719, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078462839126587, "sampling/importance_sampling_ratio/min": 4.561973582895007e-06, "sampling/sampling_logp_difference/max": 12.297755241394043, "sampling/sampling_logp_difference/mean": 0.14773216843605042, "step": 593 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.15625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.30913083255290985, "epoch": 1.563157894736842, "grad_norm": 0.007575163152068853, "learning_rate": 1e-06, "loss": -0.0019, "step": 594 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.310193806886673, "epoch": 1.5657894736842106, "grad_norm": 0.007082466036081314, "learning_rate": 1e-06, "loss": 0.0004, "step": 595 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3109205365180969, "epoch": 1.568421052631579, "grad_norm": 0.008095265366137028, "learning_rate": 1e-06, "loss": 0.0021, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 355.970703125, "completions/mean_terminated_length": 355.970703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.3134462833404541, "epoch": 1.5710526315789473, "frac_reward_zero_std": 0.3125, "grad_norm": 0.012334701605141163, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 165526111.0, "reward": 0.64453125, "reward_std": 0.167199969291687, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.63671875, "rewards/symbolic_reward_accuracy/std": 0.4814152419567108, "rewards/symbolic_reward_partial_score/mean": 0.875, "rewards/symbolic_reward_partial_score/std": 0.18969999253749847, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0783336162567139, "sampling/importance_sampling_ratio/min": 2.1293792542564915e-06, "sampling/sampling_logp_difference/max": 13.059679985046387, "sampling/sampling_logp_difference/mean": 0.14881224930286407, "step": 597 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.316374272108078, "epoch": 1.5736842105263158, "grad_norm": 0.008729521185159683, "learning_rate": 1e-06, "loss": -0.0011, "step": 598 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.34375, "entropy": 0.30681292712688446, "epoch": 1.5763157894736843, "grad_norm": 0.0042996518313884735, "learning_rate": 1e-06, "loss": -0.0017, "step": 599 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1953125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.31428535282611847, "epoch": 1.5789473684210527, "grad_norm": 0.008006506599485874, "learning_rate": 1e-06, "loss": 0.0029, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 352.412109375, "completions/mean_terminated_length": 352.412109375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.30341216921806335, "epoch": 1.581578947368421, "frac_reward_zero_std": 0.34375, "grad_norm": 0.012584522366523743, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 166133714.0, "reward": 0.6537594795227051, "reward_std": 0.15127253532409668, "rewards/progression_diversity/mean": -3.323890268802643e-05, "rewards/progression_diversity/std": 0.0007521104998886585, "rewards/symbolic_reward_accuracy/mean": 0.646484375, "rewards/symbolic_reward_accuracy/std": 0.47852855920791626, "rewards/symbolic_reward_partial_score/mean": 0.8868814706802368, "rewards/symbolic_reward_partial_score/std": 0.17624127864837646, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774664878845215, "sampling/importance_sampling_ratio/min": 0.0012332568876445293, "sampling/sampling_logp_difference/max": 6.698096752166748, "sampling/sampling_logp_difference/mean": 0.14874780178070068, "step": 601 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31365664303302765, "epoch": 1.5842105263157895, "grad_norm": 0.008380788378417492, "learning_rate": 1e-06, "loss": -0.0003, "step": 602 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3135155141353607, "epoch": 1.586842105263158, "grad_norm": 0.006163998506963253, "learning_rate": 1e-06, "loss": 0.001, "step": 603 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.316244512796402, "epoch": 1.5894736842105264, "grad_norm": 0.009533174335956573, "learning_rate": 1e-06, "loss": -0.0029, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 352.10546875, "completions/mean_terminated_length": 352.10546875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.30988338589668274, "epoch": 1.5921052631578947, "frac_reward_zero_std": 0.5, "grad_norm": 0.009614565409719944, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 166705288.0, "reward": 0.6556152105331421, "reward_std": 0.14926879107952118, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.642578125, "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, "rewards/symbolic_reward_partial_score/mean": 0.9002279043197632, "rewards/symbolic_reward_partial_score/std": 0.15159675478935242, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0770351886749268, "sampling/importance_sampling_ratio/min": 7.399953756248578e-05, "sampling/sampling_logp_difference/max": 9.511451721191406, "sampling/sampling_logp_difference/mean": 0.14816106855869293, "step": 605 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3066626638174057, "epoch": 1.594736842105263, "grad_norm": 0.006337527651339769, "learning_rate": 1e-06, "loss": -0.0009, "step": 606 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3137834966182709, "epoch": 1.5973684210526315, "grad_norm": 0.0055765085853636265, "learning_rate": 1e-06, "loss": 0.0012, "step": 607 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31103527545928955, "epoch": 1.6, "grad_norm": 0.0059480974450707436, "learning_rate": 1e-06, "loss": -0.0005, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 350.796875, "completions/mean_terminated_length": 350.796875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.3112498074769974, "epoch": 1.6026315789473684, "frac_reward_zero_std": 0.5625, "grad_norm": 0.00678864074870944, "learning_rate": 1e-06, "loss": 0.0013, "num_tokens": 167282688.0, "reward": 0.6035632491111755, "reward_std": 0.06579291820526123, "rewards/progression_diversity/mean": -0.00012304651318117976, "rewards/progression_diversity/std": 0.0026907166466116905, "rewards/symbolic_reward_accuracy/mean": 0.57421875, "rewards/symbolic_reward_accuracy/std": 0.4949444830417633, "rewards/symbolic_reward_partial_score/mean": 0.8634439706802368, "rewards/symbolic_reward_partial_score/std": 0.17649410665035248, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077202320098877, "sampling/importance_sampling_ratio/min": 0.0004482800140976906, "sampling/sampling_logp_difference/max": 7.710092544555664, "sampling/sampling_logp_difference/mean": 0.14757178723812103, "step": 609 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3036888688802719, "epoch": 1.6052631578947367, "grad_norm": 0.006577013060450554, "learning_rate": 1e-06, "loss": 0.0008, "step": 610 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31006863713264465, "epoch": 1.6078947368421053, "grad_norm": 0.005068251863121986, "learning_rate": 1e-06, "loss": -0.0019, "step": 611 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.30943046510219574, "epoch": 1.6105263157894738, "grad_norm": 0.006940970662981272, "learning_rate": 1e-06, "loss": -0.0003, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 348.203125, "completions/mean_terminated_length": 348.203125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "entropy": 0.30800144374370575, "epoch": 1.6131578947368421, "frac_reward_zero_std": 0.46875, "grad_norm": 0.010091869160532951, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 167850440.0, "reward": 0.6912109851837158, "reward_std": 0.1221666932106018, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.705078125, "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, "rewards/symbolic_reward_partial_score/mean": 0.8938802480697632, "rewards/symbolic_reward_partial_score/std": 0.1832769364118576, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0772395133972168, "sampling/importance_sampling_ratio/min": 0.00012781730038113892, "sampling/sampling_logp_difference/max": 8.964908599853516, "sampling/sampling_logp_difference/mean": 0.14840662479400635, "step": 613 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.30726876854896545, "epoch": 1.6157894736842104, "grad_norm": 0.006161559373140335, "learning_rate": 1e-06, "loss": -0.0017, "step": 614 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3138604611158371, "epoch": 1.618421052631579, "grad_norm": 0.00567243155092001, "learning_rate": 1e-06, "loss": -0.0006, "step": 615 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31249263882637024, "epoch": 1.6210526315789475, "grad_norm": 0.0038140625692903996, "learning_rate": 1e-06, "loss": 0.0024, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 340.693359375, "completions/mean_terminated_length": 340.693359375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.30807264149188995, "epoch": 1.6236842105263158, "frac_reward_zero_std": 0.34375, "grad_norm": 0.009525323286652565, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 168432171.0, "reward": 0.64228355884552, "reward_std": 0.11010335385799408, "rewards/progression_diversity/mean": -0.00016110966680571437, "rewards/progression_diversity/std": 0.002038724021986127, "rewards/symbolic_reward_accuracy/mean": 0.626953125, "rewards/symbolic_reward_accuracy/std": 0.48408737778663635, "rewards/symbolic_reward_partial_score/mean": 0.8870443105697632, "rewards/symbolic_reward_partial_score/std": 0.16728946566581726, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0766582489013672, "sampling/importance_sampling_ratio/min": 3.1295207008952275e-05, "sampling/sampling_logp_difference/max": 10.372045516967773, "sampling/sampling_logp_difference/mean": 0.14576473832130432, "step": 617 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3076433092355728, "epoch": 1.6263157894736842, "grad_norm": 0.005641128867864609, "learning_rate": 1e-06, "loss": 0.0004, "step": 618 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.30672571063041687, "epoch": 1.6289473684210525, "grad_norm": 0.00586570193991065, "learning_rate": 1e-06, "loss": -0.001, "step": 619 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3051045835018158, "epoch": 1.631578947368421, "grad_norm": 0.006130191031843424, "learning_rate": 1e-06, "loss": -0.0014, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 344.671875, "completions/mean_terminated_length": 344.671875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3110572397708893, "epoch": 1.6342105263157896, "frac_reward_zero_std": 0.5, "grad_norm": 0.006634837947785854, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 169024803.0, "reward": 0.6852539777755737, "reward_std": 0.10974645614624023, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.69140625, "rewards/symbolic_reward_accuracy/std": 0.4623647928237915, "rewards/symbolic_reward_partial_score/mean": 0.9013671875, "rewards/symbolic_reward_partial_score/std": 0.1646461933851242, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0777833461761475, "sampling/importance_sampling_ratio/min": 0.0019899923354387283, "sampling/sampling_logp_difference/max": 6.2196245193481445, "sampling/sampling_logp_difference/mean": 0.14576740562915802, "step": 621 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3100120574235916, "epoch": 1.6368421052631579, "grad_norm": 0.004018013831228018, "learning_rate": 1e-06, "loss": 0.001, "step": 622 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31171390414237976, "epoch": 1.6394736842105262, "grad_norm": 0.008900276385247707, "learning_rate": 1e-06, "loss": 0.0015, "step": 623 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.30071602761745453, "epoch": 1.6421052631578947, "grad_norm": 0.007619917392730713, "learning_rate": 1e-06, "loss": -0.0007, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 352.658203125, "completions/mean_terminated_length": 352.658203125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3088865578174591, "epoch": 1.6447368421052633, "frac_reward_zero_std": 0.625, "grad_norm": 0.006969306152313948, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 169577812.0, "reward": 0.7381347417831421, "reward_std": 0.09588056802749634, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.763671875, "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, "rewards/symbolic_reward_partial_score/mean": 0.93310546875, "rewards/symbolic_reward_partial_score/std": 0.12945035099983215, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774608850479126, "sampling/importance_sampling_ratio/min": 0.0002049918402917683, "sampling/sampling_logp_difference/max": 8.49254035949707, "sampling/sampling_logp_difference/mean": 0.14746268093585968, "step": 625 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31198354065418243, "epoch": 1.6473684210526316, "grad_norm": 0.005133276339620352, "learning_rate": 1e-06, "loss": 0.0018, "step": 626 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3059317022562027, "epoch": 1.65, "grad_norm": 0.005952898412942886, "learning_rate": 1e-06, "loss": -0.0007, "step": 627 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3020048588514328, "epoch": 1.6526315789473685, "grad_norm": 0.0032144656870514154, "learning_rate": 1e-06, "loss": -0.0015, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 343.083984375, "completions/mean_terminated_length": 343.083984375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3132614195346832, "epoch": 1.655263157894737, "frac_reward_zero_std": 0.53125, "grad_norm": 0.004867217969149351, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 170130591.0, "reward": 0.693359375, "reward_std": 0.11107412725687027, "rewards/progression_diversity/mean": -1.2624410601347336e-06, "rewards/progression_diversity/std": 2.8565777029143646e-05, "rewards/symbolic_reward_accuracy/mean": 0.712890625, "rewards/symbolic_reward_accuracy/std": 0.45285552740097046, "rewards/symbolic_reward_partial_score/mean": 0.8854166865348816, "rewards/symbolic_reward_partial_score/std": 0.20853707194328308, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077362298965454, "sampling/importance_sampling_ratio/min": 3.827792897936888e-05, "sampling/sampling_logp_difference/max": 10.170637130737305, "sampling/sampling_logp_difference/mean": 0.14775975048542023, "step": 629 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3046818673610687, "epoch": 1.6578947368421053, "grad_norm": 0.003144451417028904, "learning_rate": 1e-06, "loss": -0.0004, "step": 630 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31304094195365906, "epoch": 1.6605263157894736, "grad_norm": 0.006349779199808836, "learning_rate": 1e-06, "loss": -0.0011, "step": 631 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31196820735931396, "epoch": 1.663157894736842, "grad_norm": 0.008312534540891647, "learning_rate": 1e-06, "loss": 0.0017, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 355.884765625, "completions/mean_terminated_length": 355.884765625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.30678650736808777, "epoch": 1.6657894736842105, "frac_reward_zero_std": 0.5, "grad_norm": 0.010083966888487339, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 170696260.0, "reward": 0.7260736227035522, "reward_std": 0.12834513187408447, "rewards/progression_diversity/mean": -6.044433393981308e-05, "rewards/progression_diversity/std": 0.001367699122056365, "rewards/symbolic_reward_accuracy/mean": 0.751953125, "rewards/symbolic_reward_accuracy/std": 0.4323015511035919, "rewards/symbolic_reward_partial_score/mean": 0.9163411855697632, "rewards/symbolic_reward_partial_score/std": 0.16328899562358856, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0770810842514038, "sampling/importance_sampling_ratio/min": 0.003595878602936864, "sampling/sampling_logp_difference/max": 5.62796688079834, "sampling/sampling_logp_difference/mean": 0.14828580617904663, "step": 633 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.314095139503479, "epoch": 1.668421052631579, "grad_norm": 0.0036648185923695564, "learning_rate": 1e-06, "loss": -0.0013, "step": 634 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31373080611228943, "epoch": 1.6710526315789473, "grad_norm": 0.003688375698402524, "learning_rate": 1e-06, "loss": -0.0001, "step": 635 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.30743128061294556, "epoch": 1.6736842105263157, "grad_norm": 0.0059085749089717865, "learning_rate": 1e-06, "loss": 0.0009, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "entropy": 0.3069046139717102, "epoch": 1.6763157894736842, "frac_reward_zero_std": 0.46875, "grad_norm": 0.006045298185199499, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 171266692.0, "reward": 0.7287590503692627, "reward_std": 0.10971052944660187, "rewards/progression_diversity/mean": -7.262287545017898e-05, "rewards/progression_diversity/std": 0.0010342065943405032, "rewards/symbolic_reward_accuracy/mean": 0.7578125, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.91357421875, "rewards/symbolic_reward_partial_score/std": 0.17370551824569702, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0787405967712402, "sampling/importance_sampling_ratio/min": 6.20439004705986e-06, "sampling/sampling_logp_difference/max": 11.990253448486328, "sampling/sampling_logp_difference/mean": 0.14685004949569702, "step": 637 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3091466426849365, "epoch": 1.6789473684210527, "grad_norm": 0.007839719764888287, "learning_rate": 1e-06, "loss": 0.0003, "step": 638 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.30954572558403015, "epoch": 1.681578947368421, "grad_norm": 0.00465408293530345, "learning_rate": 1e-06, "loss": -0.0002, "step": 639 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3108525574207306, "epoch": 1.6842105263157894, "grad_norm": 0.005443122237920761, "learning_rate": 1e-06, "loss": -0.0004, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 346.2265625, "completions/mean_terminated_length": 346.2265625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "entropy": 0.3054659068584442, "epoch": 1.686842105263158, "frac_reward_zero_std": 0.34375, "grad_norm": 0.012117248959839344, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 171841752.0, "reward": 0.6958000659942627, "reward_std": 0.11660270392894745, "rewards/progression_diversity/mean": -7.265746535267681e-05, "rewards/progression_diversity/std": 0.0009757342631928623, "rewards/symbolic_reward_accuracy/mean": 0.705078125, "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, "rewards/symbolic_reward_partial_score/mean": 0.9091796875, "rewards/symbolic_reward_partial_score/std": 0.1629505455493927, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0758869647979736, "sampling/importance_sampling_ratio/min": 6.6611269176064525e-06, "sampling/sampling_logp_difference/max": 11.919221878051758, "sampling/sampling_logp_difference/mean": 0.1453857272863388, "step": 641 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.234375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3011976033449173, "epoch": 1.6894736842105265, "grad_norm": 0.007454789709299803, "learning_rate": 1e-06, "loss": -0.0012, "step": 642 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.31143735349178314, "epoch": 1.6921052631578948, "grad_norm": 0.005452565383166075, "learning_rate": 1e-06, "loss": -0.001, "step": 643 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2890625, "entropy": 0.30266809463500977, "epoch": 1.694736842105263, "grad_norm": 0.00574772572144866, "learning_rate": 1e-06, "loss": 0.0012, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 355.8515625, "completions/mean_terminated_length": 355.8515625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "entropy": 0.3147277981042862, "epoch": 1.6973684210526314, "frac_reward_zero_std": 0.46875, "grad_norm": 0.007081964984536171, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 172435436.0, "reward": 0.5885741710662842, "reward_std": 0.10568203777074814, "rewards/progression_diversity/mean": -9.049794243765064e-06, "rewards/progression_diversity/std": 0.0002047734596999362, "rewards/symbolic_reward_accuracy/mean": 0.552734375, "rewards/symbolic_reward_accuracy/std": 0.4976975917816162, "rewards/symbolic_reward_partial_score/mean": 0.8564453125, "rewards/symbolic_reward_partial_score/std": 0.18515437841415405, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0765762329101562, "sampling/importance_sampling_ratio/min": 5.255365977063775e-05, "sampling/sampling_logp_difference/max": 9.853675842285156, "sampling/sampling_logp_difference/mean": 0.146433487534523, "step": 645 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.31117910146713257, "epoch": 1.7, "grad_norm": 0.004298560321331024, "learning_rate": 1e-06, "loss": 0.0009, "step": 646 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3119800388813019, "epoch": 1.7026315789473685, "grad_norm": 0.006341192871332169, "learning_rate": 1e-06, "loss": -0.0015, "step": 647 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1640625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3077608495950699, "epoch": 1.7052631578947368, "grad_norm": 0.004039146471768618, "learning_rate": 1e-06, "loss": 0.0024, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 361.634765625, "completions/mean_terminated_length": 361.634765625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "entropy": 0.30650724470615387, "epoch": 1.7078947368421051, "frac_reward_zero_std": 0.59375, "grad_norm": 0.007845344953238964, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 173023057.0, "reward": 0.6627441644668579, "reward_std": 0.07411643862724304, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.65625, "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, "rewards/symbolic_reward_partial_score/mean": 0.8966470956802368, "rewards/symbolic_reward_partial_score/std": 0.16268277168273926, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0776004791259766, "sampling/importance_sampling_ratio/min": 0.00023482624965254217, "sampling/sampling_logp_difference/max": 8.356664657592773, "sampling/sampling_logp_difference/mean": 0.14773426949977875, "step": 649 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31050027906894684, "epoch": 1.7105263157894737, "grad_norm": 0.0048981113359332085, "learning_rate": 1e-06, "loss": 0.0006, "step": 650 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.31170788407325745, "epoch": 1.7131578947368422, "grad_norm": 0.004619154147803783, "learning_rate": 1e-06, "loss": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3152112662792206, "epoch": 1.7157894736842105, "grad_norm": 0.0038981193210929632, "learning_rate": 1e-06, "loss": -0.0013, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 671.0, "completions/max_terminated_length": 671.0, "completions/mean_length": 349.685546875, "completions/mean_terminated_length": 349.685546875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.31783896684646606, "epoch": 1.7184210526315788, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0073747457936406136, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 173585712.0, "reward": 0.7288573384284973, "reward_std": 0.06641851365566254, "rewards/progression_diversity/mean": -1.288700968871126e-05, "rewards/progression_diversity/std": 0.00029159971745684743, "rewards/symbolic_reward_accuracy/mean": 0.748046875, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.9334309697151184, "rewards/symbolic_reward_partial_score/std": 0.13425354659557343, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0794637203216553, "sampling/importance_sampling_ratio/min": 5.184490703413758e-08, "sampling/sampling_logp_difference/max": 16.775009155273438, "sampling/sampling_logp_difference/mean": 0.15089115500450134, "step": 653 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.31695644557476044, "epoch": 1.7210526315789474, "grad_norm": 0.0054539949633181095, "learning_rate": 1e-06, "loss": 0.0005, "step": 654 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3197050541639328, "epoch": 1.723684210526316, "grad_norm": 0.005290233064442873, "learning_rate": 1e-06, "loss": -0.0011, "step": 655 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3151402175426483, "epoch": 1.7263157894736842, "grad_norm": 0.009018939919769764, "learning_rate": 1e-06, "loss": 0.0007, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 348.197265625, "completions/mean_terminated_length": 348.197265625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.3115183562040329, "epoch": 1.7289473684210526, "frac_reward_zero_std": 0.46875, "grad_norm": 0.008836961351335049, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 174152597.0, "reward": 0.703369140625, "reward_std": 0.11740782856941223, "rewards/progression_diversity/mean": -5.42844645678997e-06, "rewards/progression_diversity/std": 0.00012283171236049384, "rewards/symbolic_reward_accuracy/mean": 0.72265625, "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, "rewards/symbolic_reward_partial_score/mean": 0.8992513418197632, "rewards/symbolic_reward_partial_score/std": 0.18293997645378113, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0789730548858643, "sampling/importance_sampling_ratio/min": 4.3294774513924494e-05, "sampling/sampling_logp_difference/max": 10.047478675842285, "sampling/sampling_logp_difference/mean": 0.14934486150741577, "step": 657 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31983131170272827, "epoch": 1.731578947368421, "grad_norm": 0.006589457858353853, "learning_rate": 1e-06, "loss": 0.0011, "step": 658 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3190360963344574, "epoch": 1.7342105263157894, "grad_norm": 0.004673468880355358, "learning_rate": 1e-06, "loss": -0.0011, "step": 659 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31859441101551056, "epoch": 1.736842105263158, "grad_norm": 0.008711190894246101, "learning_rate": 1e-06, "loss": 0.0022, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 364.14453125, "completions/mean_terminated_length": 364.14453125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.30695727467536926, "epoch": 1.7394736842105263, "frac_reward_zero_std": 0.46875, "grad_norm": 0.00819223653525114, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 174727327.0, "reward": 0.6963849067687988, "reward_std": 0.10175984352827072, "rewards/progression_diversity/mean": -0.00018321775132790208, "rewards/progression_diversity/std": 0.002992808585986495, "rewards/symbolic_reward_accuracy/mean": 0.7109375, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.8994140625, "rewards/symbolic_reward_partial_score/std": 0.16723868250846863, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077476143836975, "sampling/importance_sampling_ratio/min": 0.0003383801376912743, "sampling/sampling_logp_difference/max": 7.991340637207031, "sampling/sampling_logp_difference/mean": 0.14830249547958374, "step": 661 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3172942250967026, "epoch": 1.7421052631578946, "grad_norm": 0.0050326017662882805, "learning_rate": 1e-06, "loss": -0.0005, "step": 662 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31450478732585907, "epoch": 1.7447368421052631, "grad_norm": 0.004655744414776564, "learning_rate": 1e-06, "loss": 0.0001, "step": 663 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.31187137961387634, "epoch": 1.7473684210526317, "grad_norm": 0.007312784902751446, "learning_rate": 1e-06, "loss": 0.0007, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 361.953125, "completions/mean_terminated_length": 361.953125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "entropy": 0.312186524271965, "epoch": 1.75, "frac_reward_zero_std": 0.53125, "grad_norm": 0.006459662225097418, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 175310279.0, "reward": 0.6182616949081421, "reward_std": 0.08476275205612183, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.595703125, "rewards/symbolic_reward_accuracy/std": 0.4912354052066803, "rewards/symbolic_reward_partial_score/mean": 0.8694661855697632, "rewards/symbolic_reward_partial_score/std": 0.1792290210723877, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077375054359436, "sampling/importance_sampling_ratio/min": 0.00022012810222804546, "sampling/sampling_logp_difference/max": 8.421300888061523, "sampling/sampling_logp_difference/mean": 0.14795972406864166, "step": 665 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3078509271144867, "epoch": 1.7526315789473683, "grad_norm": 0.007247635163366795, "learning_rate": 1e-06, "loss": 0.0003, "step": 666 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3109118938446045, "epoch": 1.7552631578947369, "grad_norm": 0.005482961889356375, "learning_rate": 1e-06, "loss": -0.0007, "step": 667 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3142966479063034, "epoch": 1.7578947368421054, "grad_norm": 0.005572004709392786, "learning_rate": 1e-06, "loss": 0.0014, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 355.5546875, "completions/mean_terminated_length": 355.5546875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "entropy": 0.3197452425956726, "epoch": 1.7605263157894737, "frac_reward_zero_std": 0.5625, "grad_norm": 0.005338034126907587, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 175890979.0, "reward": 0.6089342832565308, "reward_std": 0.10050010681152344, "rewards/progression_diversity/mean": -0.00012931314995512366, "rewards/progression_diversity/std": 0.00251020141877234, "rewards/symbolic_reward_accuracy/mean": 0.580078125, "rewards/symbolic_reward_accuracy/std": 0.4940285086631775, "rewards/symbolic_reward_partial_score/mean": 0.86962890625, "rewards/symbolic_reward_partial_score/std": 0.17219598591327667, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0790578126907349, "sampling/importance_sampling_ratio/min": 0.0004860071639996022, "sampling/sampling_logp_difference/max": 7.629287242889404, "sampling/sampling_logp_difference/mean": 0.15068799257278442, "step": 669 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3192948251962662, "epoch": 1.763157894736842, "grad_norm": 0.009029190056025982, "learning_rate": 1e-06, "loss": 0.0004, "step": 670 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.32492591440677643, "epoch": 1.7657894736842106, "grad_norm": 0.004878129344433546, "learning_rate": 1e-06, "loss": -0.0001, "step": 671 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31461015343666077, "epoch": 1.768421052631579, "grad_norm": 0.00928495917469263, "learning_rate": 1e-06, "loss": -0.0003, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 353.5234375, "completions/mean_terminated_length": 353.5234375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "entropy": 0.3127034604549408, "epoch": 1.7710526315789474, "frac_reward_zero_std": 0.5, "grad_norm": 0.012209014035761356, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 176469775.0, "reward": 0.7016580104827881, "reward_std": 0.08149384707212448, "rewards/progression_diversity/mean": -0.0002171548258047551, "rewards/progression_diversity/std": 0.004398517310619354, "rewards/symbolic_reward_accuracy/mean": 0.7109375, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.9169921875, "rewards/symbolic_reward_partial_score/std": 0.15359680354595184, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774240493774414, "sampling/importance_sampling_ratio/min": 4.304129674892465e-07, "sampling/sampling_logp_difference/max": 14.658520698547363, "sampling/sampling_logp_difference/mean": 0.14875832200050354, "step": 673 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.31441599130630493, "epoch": 1.7736842105263158, "grad_norm": 0.006872444413602352, "learning_rate": 1e-06, "loss": 0.0009, "step": 674 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3115265518426895, "epoch": 1.776315789473684, "grad_norm": 0.0034778716508299112, "learning_rate": 1e-06, "loss": -0.0001, "step": 675 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31803882122039795, "epoch": 1.7789473684210526, "grad_norm": 0.007153674028813839, "learning_rate": 1e-06, "loss": 0.0002, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 365.19921875, "completions/mean_terminated_length": 365.19921875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3086838871240616, "epoch": 1.7815789473684212, "frac_reward_zero_std": 0.53125, "grad_norm": 0.005677321460098028, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 177066901.0, "reward": 0.6431640386581421, "reward_std": 0.08386066555976868, "rewards/progression_diversity/mean": -2.6685718239605194e-06, "rewards/progression_diversity/std": 6.03828884777613e-05, "rewards/symbolic_reward_accuracy/mean": 0.63671875, "rewards/symbolic_reward_accuracy/std": 0.4814152419567108, "rewards/symbolic_reward_partial_score/mean": 0.8704427480697632, "rewards/symbolic_reward_partial_score/std": 0.19743919372558594, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0781702995300293, "sampling/importance_sampling_ratio/min": 0.00019859473104588687, "sampling/sampling_logp_difference/max": 8.52424430847168, "sampling/sampling_logp_difference/mean": 0.14643803238868713, "step": 677 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3097618967294693, "epoch": 1.7842105263157895, "grad_norm": 0.008383281528949738, "learning_rate": 1e-06, "loss": 0.0007, "step": 678 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3085528761148453, "epoch": 1.7868421052631578, "grad_norm": 0.004206513985991478, "learning_rate": 1e-06, "loss": -0.0009, "step": 679 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3110394924879074, "epoch": 1.7894736842105263, "grad_norm": 0.0055688670836389065, "learning_rate": 1e-06, "loss": 0.0007, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 364.806640625, "completions/mean_terminated_length": 364.806640625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.31680648028850555, "epoch": 1.7921052631578949, "frac_reward_zero_std": 0.46875, "grad_norm": 0.010036691091954708, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 177653842.0, "reward": 0.6611816883087158, "reward_std": 0.1240668073296547, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.666015625, "rewards/symbolic_reward_accuracy/std": 0.47209542989730835, "rewards/symbolic_reward_partial_score/mean": 0.87255859375, "rewards/symbolic_reward_partial_score/std": 0.20477424561977386, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0788350105285645, "sampling/importance_sampling_ratio/min": 0.0001117745487135835, "sampling/sampling_logp_difference/max": 9.099026679992676, "sampling/sampling_logp_difference/mean": 0.14859601855278015, "step": 681 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31431975960731506, "epoch": 1.7947368421052632, "grad_norm": 0.006399835925549269, "learning_rate": 1e-06, "loss": 0.0008, "step": 682 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3147125542163849, "epoch": 1.7973684210526315, "grad_norm": 0.007597615476697683, "learning_rate": 1e-06, "loss": 0.0009, "step": 683 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3126670718193054, "epoch": 1.8, "grad_norm": 0.004449205473065376, "learning_rate": 1e-06, "loss": -0.0023, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 375.79296875, "completions/mean_terminated_length": 375.79296875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.30589479207992554, "epoch": 1.8026315789473686, "frac_reward_zero_std": 0.53125, "grad_norm": 0.009832125157117844, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 178257416.0, "reward": 0.690575897693634, "reward_std": 0.09389779716730118, "rewards/progression_diversity/mean": -3.32641793647781e-05, "rewards/progression_diversity/std": 0.0005181218730285764, "rewards/symbolic_reward_accuracy/mean": 0.703125, "rewards/symbolic_reward_accuracy/std": 0.45732781291007996, "rewards/symbolic_reward_partial_score/mean": 0.8963216543197632, "rewards/symbolic_reward_partial_score/std": 0.17704488337039948, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0771936178207397, "sampling/importance_sampling_ratio/min": 0.0028477907180786133, "sampling/sampling_logp_difference/max": 5.861211776733398, "sampling/sampling_logp_difference/mean": 0.14920267462730408, "step": 685 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31586427986621857, "epoch": 1.805263157894737, "grad_norm": 0.0033005087170749903, "learning_rate": 1e-06, "loss": -0.0009, "step": 686 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3129272162914276, "epoch": 1.8078947368421052, "grad_norm": 0.0028064732905477285, "learning_rate": 1e-06, "loss": 0.0027, "step": 687 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3123166412115097, "epoch": 1.8105263157894735, "grad_norm": 0.003090722020715475, "learning_rate": 1e-06, "loss": 0.0004, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 374.056640625, "completions/mean_terminated_length": 374.056640625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3101574629545212, "epoch": 1.813157894736842, "frac_reward_zero_std": 0.59375, "grad_norm": 0.007430217228829861, "learning_rate": 1e-06, "loss": -0.0018, "num_tokens": 178852581.0, "reward": 0.6585937142372131, "reward_std": 0.06606994569301605, "rewards/progression_diversity/mean": -8.257005902123637e-06, "rewards/progression_diversity/std": 0.00018683471716940403, "rewards/symbolic_reward_accuracy/mean": 0.65625, "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, "rewards/symbolic_reward_partial_score/mean": 0.8828125, "rewards/symbolic_reward_partial_score/std": 0.174610897898674, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0775363445281982, "sampling/importance_sampling_ratio/min": 0.0027425093576312065, "sampling/sampling_logp_difference/max": 5.898881912231445, "sampling/sampling_logp_difference/mean": 0.14803138375282288, "step": 689 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.31864188611507416, "epoch": 1.8157894736842106, "grad_norm": 0.0023194809909909964, "learning_rate": 1e-06, "loss": 0.0004, "step": 690 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3088736832141876, "epoch": 1.818421052631579, "grad_norm": 0.004817016888409853, "learning_rate": 1e-06, "loss": 0.0014, "step": 691 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.31209132075309753, "epoch": 1.8210526315789473, "grad_norm": 0.005796011071652174, "learning_rate": 1e-06, "loss": 0.0002, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 386.078125, "completions/mean_terminated_length": 386.078125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "entropy": 0.30596648156642914, "epoch": 1.8236842105263158, "frac_reward_zero_std": 0.59375, "grad_norm": 0.009517088532447815, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 179452397.0, "reward": 0.6856445670127869, "reward_std": 0.09474059194326401, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.693359375, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.8987630605697632, "rewards/symbolic_reward_partial_score/std": 0.16487817466259003, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773887634277344, "sampling/importance_sampling_ratio/min": 0.006162859965115786, "sampling/sampling_logp_difference/max": 5.089214324951172, "sampling/sampling_logp_difference/mean": 0.14641115069389343, "step": 693 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3115035742521286, "epoch": 1.8263157894736843, "grad_norm": 0.006516161374747753, "learning_rate": 1e-06, "loss": -0.0015, "step": 694 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.30916786193847656, "epoch": 1.8289473684210527, "grad_norm": 0.005851257126778364, "learning_rate": 1e-06, "loss": -0.0006, "step": 695 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3106692433357239, "epoch": 1.831578947368421, "grad_norm": 0.005098440684378147, "learning_rate": 1e-06, "loss": 0.0011, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 380.732421875, "completions/mean_terminated_length": 380.732421875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.31172554194927216, "epoch": 1.8342105263157895, "frac_reward_zero_std": 0.40625, "grad_norm": 0.008900835178792477, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 180043140.0, "reward": 0.690136194229126, "reward_std": 0.1291923224925995, "rewards/progression_diversity/mean": -5.790265277028084e-05, "rewards/progression_diversity/std": 0.0011140767019242048, "rewards/symbolic_reward_accuracy/mean": 0.701171875, "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, "rewards/symbolic_reward_partial_score/mean": 0.8987630605697632, "rewards/symbolic_reward_partial_score/std": 0.1765023171901703, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0769742727279663, "sampling/importance_sampling_ratio/min": 5.942506959399907e-06, "sampling/sampling_logp_difference/max": 12.033379554748535, "sampling/sampling_logp_difference/mean": 0.1485772430896759, "step": 697 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31369997560977936, "epoch": 1.836842105263158, "grad_norm": 0.007809172384440899, "learning_rate": 1e-06, "loss": -0.0002, "step": 698 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31465624272823334, "epoch": 1.8394736842105264, "grad_norm": 0.009294657967984676, "learning_rate": 1e-06, "loss": 0.0006, "step": 699 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3038342297077179, "epoch": 1.8421052631578947, "grad_norm": 0.011830865405499935, "learning_rate": 1e-06, "loss": 0.0012, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 385.259765625, "completions/mean_terminated_length": 385.259765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.30587026476860046, "epoch": 1.844736842105263, "frac_reward_zero_std": 0.46875, "grad_norm": 0.0067597865127027035, "learning_rate": 1e-06, "loss": 0.0015, "num_tokens": 180647529.0, "reward": 0.722265362739563, "reward_std": 0.12456189095973969, "rewards/progression_diversity/mean": -2.3203070668387227e-05, "rewards/progression_diversity/std": 0.0005250255926512182, "rewards/symbolic_reward_accuracy/mean": 0.73828125, "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, "rewards/symbolic_reward_partial_score/mean": 0.9309896230697632, "rewards/symbolic_reward_partial_score/std": 0.12603533267974854, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0772705078125, "sampling/importance_sampling_ratio/min": 2.513166327844374e-05, "sampling/sampling_logp_difference/max": 10.591382026672363, "sampling/sampling_logp_difference/mean": 0.14627277851104736, "step": 701 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3159767836332321, "epoch": 1.8473684210526315, "grad_norm": 0.008303055539727211, "learning_rate": 1e-06, "loss": 0.0015, "step": 702 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3041051924228668, "epoch": 1.85, "grad_norm": 0.0064351242035627365, "learning_rate": 1e-06, "loss": -0.0025, "step": 703 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3142499476671219, "epoch": 1.8526315789473684, "grad_norm": 0.005516021512448788, "learning_rate": 1e-06, "loss": 0.0002, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 375.818359375, "completions/mean_terminated_length": 375.818359375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "entropy": 0.31115204095840454, "epoch": 1.8552631578947367, "frac_reward_zero_std": 0.5, "grad_norm": 0.009341200813651085, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 181254444.0, "reward": 0.6875925064086914, "reward_std": 0.09061338007450104, "rewards/progression_diversity/mean": -0.0005182477761991322, "rewards/progression_diversity/std": 0.008718207478523254, "rewards/symbolic_reward_accuracy/mean": 0.6875, "rewards/symbolic_reward_accuracy/std": 0.4639657139778137, "rewards/symbolic_reward_partial_score/mean": 0.9169921875, "rewards/symbolic_reward_partial_score/std": 0.14258471131324768, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0775392055511475, "sampling/importance_sampling_ratio/min": 0.001974229235202074, "sampling/sampling_logp_difference/max": 6.227577209472656, "sampling/sampling_logp_difference/mean": 0.14746874570846558, "step": 705 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.31405113637447357, "epoch": 1.8578947368421053, "grad_norm": 0.007201730273663998, "learning_rate": 1e-06, "loss": -0.0002, "step": 706 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31317275762557983, "epoch": 1.8605263157894738, "grad_norm": 0.0077608609572052956, "learning_rate": 1e-06, "loss": -0.0007, "step": 707 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30931928753852844, "epoch": 1.8631578947368421, "grad_norm": 0.007706182077527046, "learning_rate": 1e-06, "loss": -0.0003, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 711.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 372.041015625, "completions/mean_terminated_length": 372.041015625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.31280550360679626, "epoch": 1.8657894736842104, "frac_reward_zero_std": 0.40625, "grad_norm": 0.007203435059636831, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 181856577.0, "reward": 0.6337395906448364, "reward_std": 0.11881368607282639, "rewards/progression_diversity/mean": -7.104627729859203e-05, "rewards/progression_diversity/std": 0.0014897359069436789, "rewards/symbolic_reward_accuracy/mean": 0.615234375, "rewards/symbolic_reward_accuracy/std": 0.4870156943798065, "rewards/symbolic_reward_partial_score/mean": 0.8819986581802368, "rewards/symbolic_reward_partial_score/std": 0.17834146320819855, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078622579574585, "sampling/importance_sampling_ratio/min": 0.000638676225207746, "sampling/sampling_logp_difference/max": 7.356112957000732, "sampling/sampling_logp_difference/mean": 0.14934757351875305, "step": 709 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31266067922115326, "epoch": 1.868421052631579, "grad_norm": 0.005193103104829788, "learning_rate": 1e-06, "loss": -0.0029, "step": 710 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.3179129362106323, "epoch": 1.8710526315789475, "grad_norm": 0.004717197269201279, "learning_rate": 1e-06, "loss": 0.0013, "step": 711 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.32021914422512054, "epoch": 1.8736842105263158, "grad_norm": 0.006537822540849447, "learning_rate": 1e-06, "loss": 0.0017, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 409.322265625, "completions/mean_terminated_length": 378.0606689453125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.3134091943502426, "epoch": 1.8763157894736842, "frac_reward_zero_std": 0.5625, "grad_norm": 0.007834019139409065, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 182449766.0, "reward": 0.7249389290809631, "reward_std": 0.09784473478794098, "rewards/progression_diversity/mean": -0.0012266990961506963, "rewards/progression_diversity/std": 0.02737962268292904, "rewards/symbolic_reward_accuracy/mean": 0.75390625, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.90869140625, "rewards/symbolic_reward_partial_score/std": 0.17205534875392914, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0749471187591553, "sampling/importance_sampling_ratio/min": 0.004973389208316803, "sampling/sampling_logp_difference/max": 5.303653717041016, "sampling/sampling_logp_difference/mean": 0.14320877194404602, "step": 713 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3065633177757263, "epoch": 1.8789473684210525, "grad_norm": 0.00554261077195406, "learning_rate": 1e-06, "loss": 0.0016, "step": 714 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.312502920627594, "epoch": 1.881578947368421, "grad_norm": 0.006236112676560879, "learning_rate": 1e-06, "loss": -0.0006, "step": 715 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31002379953861237, "epoch": 1.8842105263157896, "grad_norm": 0.0022626114077866077, "learning_rate": 1e-06, "loss": -0.0024, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 370.48046875, "completions/mean_terminated_length": 370.48046875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.31257370114326477, "epoch": 1.8868421052631579, "frac_reward_zero_std": 0.5625, "grad_norm": 0.007542754523456097, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 183043100.0, "reward": 0.6575683951377869, "reward_std": 0.09304552525281906, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.671875, "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, "rewards/symbolic_reward_partial_score/mean": 0.84814453125, "rewards/symbolic_reward_partial_score/std": 0.23802095651626587, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778599977493286, "sampling/importance_sampling_ratio/min": 0.00111155875492841, "sampling/sampling_logp_difference/max": 6.801991939544678, "sampling/sampling_logp_difference/mean": 0.14849314093589783, "step": 717 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31163640320301056, "epoch": 1.8894736842105262, "grad_norm": 0.007604603189975023, "learning_rate": 1e-06, "loss": 0.0004, "step": 718 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3117101490497589, "epoch": 1.8921052631578947, "grad_norm": 0.007121788803488016, "learning_rate": 1e-06, "loss": -0.0024, "step": 719 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3161403089761734, "epoch": 1.8947368421052633, "grad_norm": 0.004191112704575062, "learning_rate": 1e-06, "loss": 0.0004, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 365.009765625, "completions/mean_terminated_length": 365.009765625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.3126900643110275, "epoch": 1.8973684210526316, "frac_reward_zero_std": 0.59375, "grad_norm": 0.00634095398709178, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 183639809.0, "reward": 0.6971662640571594, "reward_std": 0.07519043982028961, "rewards/progression_diversity/mean": -0.00017441027739550918, "rewards/progression_diversity/std": 0.0029865563847124577, "rewards/symbolic_reward_accuracy/mean": 0.708984375, "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, "rewards/symbolic_reward_partial_score/std": 0.16108950972557068, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077085018157959, "sampling/importance_sampling_ratio/min": 0.00030413156491704285, "sampling/sampling_logp_difference/max": 8.098050117492676, "sampling/sampling_logp_difference/mean": 0.1494484543800354, "step": 721 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31474678218364716, "epoch": 1.9, "grad_norm": 0.007915548980236053, "learning_rate": 1e-06, "loss": 0.0025, "step": 722 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.31036464869976044, "epoch": 1.9026315789473685, "grad_norm": 0.004422432277351618, "learning_rate": 1e-06, "loss": -0.0015, "step": 723 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3158973455429077, "epoch": 1.905263157894737, "grad_norm": 0.005758347921073437, "learning_rate": 1e-06, "loss": -0.0024, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 711.0, "completions/mean_length": 398.349609375, "completions/mean_terminated_length": 367.0665283203125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3149125427007675, "epoch": 1.9078947368421053, "frac_reward_zero_std": 0.65625, "grad_norm": 0.008096176199615002, "learning_rate": 1e-06, "loss": 0.0283, "num_tokens": 184239732.0, "reward": 0.6666892766952515, "reward_std": 0.06731939315795898, "rewards/progression_diversity/mean": -0.0009964853525161743, "rewards/progression_diversity/std": 0.0225478895008564, "rewards/symbolic_reward_accuracy/mean": 0.671875, "rewards/symbolic_reward_accuracy/std": 0.4699897766113281, "rewards/symbolic_reward_partial_score/mean": 0.8792318105697632, "rewards/symbolic_reward_partial_score/std": 0.1995687186717987, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768595933914185, "sampling/importance_sampling_ratio/min": 0.0012633983278647065, "sampling/sampling_logp_difference/max": 6.6739501953125, "sampling/sampling_logp_difference/mean": 0.14742116630077362, "step": 725 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.31597861647605896, "epoch": 1.9105263157894736, "grad_norm": 0.0031310454942286015, "learning_rate": 1e-06, "loss": -0.0004, "step": 726 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.31505411863327026, "epoch": 1.913157894736842, "grad_norm": 0.005425718612968922, "learning_rate": 1e-06, "loss": -0.0001, "step": 727 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3154520094394684, "epoch": 1.9157894736842105, "grad_norm": 0.003204651176929474, "learning_rate": 1e-06, "loss": 0.0005, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 371.125, "completions/mean_terminated_length": 371.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.31483209133148193, "epoch": 1.918421052631579, "frac_reward_zero_std": 0.5625, "grad_norm": 0.009964163415133953, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 184853588.0, "reward": 0.6192383170127869, "reward_std": 0.08837610483169556, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.603515625, "rewards/symbolic_reward_accuracy/std": 0.4896455705165863, "rewards/symbolic_reward_partial_score/mean": 0.8570963144302368, "rewards/symbolic_reward_partial_score/std": 0.21023648977279663, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0781691074371338, "sampling/importance_sampling_ratio/min": 3.291745451861061e-05, "sampling/sampling_logp_difference/max": 10.321507453918457, "sampling/sampling_logp_difference/mean": 0.14980000257492065, "step": 729 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31423668563365936, "epoch": 1.9210526315789473, "grad_norm": 0.005938328802585602, "learning_rate": 1e-06, "loss": 0.0009, "step": 730 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31109851598739624, "epoch": 1.9236842105263157, "grad_norm": 0.006710145156830549, "learning_rate": 1e-06, "loss": -0.0002, "step": 731 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.31440161168575287, "epoch": 1.9263157894736842, "grad_norm": 0.007554585114121437, "learning_rate": 1e-06, "loss": -0.0001, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 741.0, "completions/max_terminated_length": 741.0, "completions/mean_length": 358.267578125, "completions/mean_terminated_length": 358.267578125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.3080116808414459, "epoch": 1.9289473684210527, "frac_reward_zero_std": 0.53125, "grad_norm": 0.004908265545964241, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 185426653.0, "reward": 0.6898437738418579, "reward_std": 0.11878422647714615, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.693359375, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.9134114980697632, "rewards/symbolic_reward_partial_score/std": 0.14149527251720428, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0759687423706055, "sampling/importance_sampling_ratio/min": 3.671062586363405e-05, "sampling/sampling_logp_difference/max": 10.212444305419922, "sampling/sampling_logp_difference/mean": 0.14870089292526245, "step": 733 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3127932846546173, "epoch": 1.931578947368421, "grad_norm": 0.0042357915081083775, "learning_rate": 1e-06, "loss": 0.0001, "step": 734 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3097704350948334, "epoch": 1.9342105263157894, "grad_norm": 0.003953521605581045, "learning_rate": 1e-06, "loss": 0.0009, "step": 735 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3147825747728348, "epoch": 1.936842105263158, "grad_norm": 0.00491707818582654, "learning_rate": 1e-06, "loss": 0.0014, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 364.462890625, "completions/mean_terminated_length": 364.462890625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "entropy": 0.31078068912029266, "epoch": 1.9394736842105265, "frac_reward_zero_std": 0.53125, "grad_norm": 0.008055577054619789, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 186010890.0, "reward": 0.7134765982627869, "reward_std": 0.11845691502094269, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.7265625, "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, "rewards/symbolic_reward_partial_score/mean": 0.9251301884651184, "rewards/symbolic_reward_partial_score/std": 0.137575164437294, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782480239868164, "sampling/importance_sampling_ratio/min": 0.0007956507615745068, "sampling/sampling_logp_difference/max": 7.136350154876709, "sampling/sampling_logp_difference/mean": 0.147745743393898, "step": 737 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3155370503664017, "epoch": 1.9421052631578948, "grad_norm": 0.003693908918648958, "learning_rate": 1e-06, "loss": -0.0009, "step": 738 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.30935104191303253, "epoch": 1.944736842105263, "grad_norm": 0.004344849847257137, "learning_rate": 1e-06, "loss": 0.0003, "step": 739 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.31252725422382355, "epoch": 1.9473684210526314, "grad_norm": 0.007028626743704081, "learning_rate": 1e-06, "loss": 0.0011, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 366.111328125, "completions/mean_terminated_length": 366.111328125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.3120097368955612, "epoch": 1.95, "frac_reward_zero_std": 0.53125, "grad_norm": 0.012667186558246613, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 186614499.0, "reward": 0.6177229285240173, "reward_std": 0.08512495458126068, "rewards/progression_diversity/mean": -0.00017063260020222515, "rewards/progression_diversity/std": 0.0029495148919522762, "rewards/symbolic_reward_accuracy/mean": 0.59375, "rewards/symbolic_reward_accuracy/std": 0.49161264300346375, "rewards/symbolic_reward_partial_score/mean": 0.87158203125, "rewards/symbolic_reward_partial_score/std": 0.17591506242752075, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0777539014816284, "sampling/importance_sampling_ratio/min": 0.0005583280581049621, "sampling/sampling_logp_difference/max": 7.490563869476318, "sampling/sampling_logp_difference/mean": 0.14936181902885437, "step": 741 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3076600730419159, "epoch": 1.9526315789473685, "grad_norm": 0.008351475931704044, "learning_rate": 1e-06, "loss": 0.0005, "step": 742 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.3145374208688736, "epoch": 1.9552631578947368, "grad_norm": 0.0027690737042576075, "learning_rate": 1e-06, "loss": -0.0015, "step": 743 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3131014108657837, "epoch": 1.9578947368421051, "grad_norm": 0.005030886270105839, "learning_rate": 1e-06, "loss": -0.0009, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 369.201171875, "completions/mean_terminated_length": 369.201171875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.31135545670986176, "epoch": 1.9605263157894737, "frac_reward_zero_std": 0.59375, "grad_norm": 0.008429143577814102, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 187226026.0, "reward": 0.7362792491912842, "reward_std": 0.08533239364624023, "rewards/progression_diversity/mean": -1.097430595109472e-05, "rewards/progression_diversity/std": 0.00024832019698806107, "rewards/symbolic_reward_accuracy/mean": 0.76171875, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.9308267831802368, "rewards/symbolic_reward_partial_score/std": 0.13665828108787537, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0770303010940552, "sampling/importance_sampling_ratio/min": 0.0008326321840286255, "sampling/sampling_logp_difference/max": 7.09091854095459, "sampling/sampling_logp_difference/mean": 0.14952237904071808, "step": 745 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.31318385899066925, "epoch": 1.9631578947368422, "grad_norm": 0.0033032975625246763, "learning_rate": 1e-06, "loss": -0.0005, "step": 746 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31359700858592987, "epoch": 1.9657894736842105, "grad_norm": 0.007935271598398685, "learning_rate": 1e-06, "loss": 0.0013, "step": 747 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3112269937992096, "epoch": 1.9684210526315788, "grad_norm": 0.00817700196057558, "learning_rate": 1e-06, "loss": -0.0001, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 370.04296875, "completions/mean_terminated_length": 370.04296875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.3109482377767563, "epoch": 1.9710526315789474, "frac_reward_zero_std": 0.59375, "grad_norm": 0.006165255792438984, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 187772416.0, "reward": 0.7406250238418579, "reward_std": 0.09467050433158875, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.779296875, "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, "rewards/symbolic_reward_partial_score/mean": 0.91015625, "rewards/symbolic_reward_partial_score/std": 0.1857500523328781, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076927661895752, "sampling/importance_sampling_ratio/min": 7.953165550134145e-06, "sampling/sampling_logp_difference/max": 11.74194049835205, "sampling/sampling_logp_difference/mean": 0.1476995050907135, "step": 749 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.308622270822525, "epoch": 1.973684210526316, "grad_norm": 0.0047165341675281525, "learning_rate": 1e-06, "loss": -0.0008, "step": 750 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3079015612602234, "epoch": 1.9763157894736842, "grad_norm": 0.0040458752773702145, "learning_rate": 1e-06, "loss": 0.0001, "step": 751 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.30972377955913544, "epoch": 1.9789473684210526, "grad_norm": 0.004278114996850491, "learning_rate": 1e-06, "loss": -0.0012, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 358.255859375, "completions/mean_terminated_length": 358.255859375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3107995539903641, "epoch": 1.981578947368421, "frac_reward_zero_std": 0.6875, "grad_norm": 0.007798196282237768, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 188375491.0, "reward": 0.6957031488418579, "reward_std": 0.07784873992204666, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.70703125, "rewards/symbolic_reward_accuracy/std": 0.455569326877594, "rewards/symbolic_reward_partial_score/mean": 0.9049478769302368, "rewards/symbolic_reward_partial_score/std": 0.16625335812568665, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773422718048096, "sampling/importance_sampling_ratio/min": 0.00029975359211675823, "sampling/sampling_logp_difference/max": 8.112549781799316, "sampling/sampling_logp_difference/mean": 0.14972257614135742, "step": 753 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3143154978752136, "epoch": 1.9842105263157894, "grad_norm": 0.004069626331329346, "learning_rate": 1e-06, "loss": 0.0005, "step": 754 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.31426501274108887, "epoch": 1.986842105263158, "grad_norm": 0.0061919731087982655, "learning_rate": 1e-06, "loss": 0.0002, "step": 755 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.31135572493076324, "epoch": 1.9894736842105263, "grad_norm": 0.004079555626958609, "learning_rate": 1e-06, "loss": 0.0003, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 357.47265625, "completions/mean_terminated_length": 357.47265625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.309786781668663, "epoch": 1.9921052631578946, "frac_reward_zero_std": 0.40625, "grad_norm": 0.00555258197709918, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 188963829.0, "reward": 0.6704590320587158, "reward_std": 0.10458528995513916, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.669921875, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.89501953125, "rewards/symbolic_reward_partial_score/std": 0.16062310338020325, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0771996974945068, "sampling/importance_sampling_ratio/min": 0.0002447458100505173, "sampling/sampling_logp_difference/max": 8.315290451049805, "sampling/sampling_logp_difference/mean": 0.14952662587165833, "step": 757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31545260548591614, "epoch": 1.9947368421052631, "grad_norm": 0.006828597281128168, "learning_rate": 1e-06, "loss": 0.0022, "step": 758 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3046875, "entropy": 0.3086663782596588, "epoch": 1.9973684210526317, "grad_norm": 0.0067805699072778225, "learning_rate": 1e-06, "loss": 0.0012, "step": 759 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31685784459114075, "epoch": 2.0, "grad_norm": 0.007817032746970654, "learning_rate": 1e-06, "loss": -0.0012, "step": 760 }, { "epoch": 2.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000244140625, "eval_completions/max_length": 2330.09375, "eval_completions/max_terminated_length": 1848.78125, "eval_completions/mean_length": 383.0869140625, "eval_completions/mean_terminated_length": 379.1825752258301, "eval_completions/min_length": 154.75, "eval_completions/min_terminated_length": 154.75, "eval_entropy": 0.3063309593126178, "eval_frac_reward_zero_std": 0.41015625, "eval_loss": 0.0009358798852190375, "eval_num_tokens": 188963829.0, "eval_reward": 0.7193302186205983, "eval_reward_std": 0.14116343623027205, "eval_rewards/progression_diversity/mean": -0.0005755307929575793, "eval_rewards/progression_diversity/std": 0.006511387479804398, "eval_rewards/symbolic_reward_accuracy/mean": 0.751220703125, "eval_rewards/symbolic_reward_accuracy/std": 0.40731942653656006, "eval_rewards/symbolic_reward_partial_score/mean": 0.8976237010210752, "eval_rewards/symbolic_reward_partial_score/std": 0.18608224554918706, "eval_rewards/tag_count_reward/mean": -0.0068359375, "eval_rewards/tag_count_reward/std": 0.03354312968440354, "eval_runtime": 289.381, "eval_samples_per_second": 0.864, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0764338932931423, "eval_sampling/importance_sampling_ratio/min": 0.0022319126777483106, "eval_sampling/sampling_logp_difference/max": 14.824358269572258, "eval_sampling/sampling_logp_difference/mean": 0.1495322003029287, "eval_steps_per_second": 0.007, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 359.412109375, "completions/mean_terminated_length": 359.412109375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.31213897466659546, "epoch": 2.0026315789473683, "frac_reward_zero_std": 0.75, "grad_norm": 0.009487095288932323, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 189540808.0, "reward": 0.71630859375, "reward_std": 0.050411537289619446, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.748046875, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.8916015625, "rewards/symbolic_reward_partial_score/std": 0.19575290381908417, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0758132934570312, "sampling/importance_sampling_ratio/min": 0.0004769986553583294, "sampling/sampling_logp_difference/max": 7.64799690246582, "sampling/sampling_logp_difference/mean": 0.14864085614681244, "step": 761 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3116689920425415, "epoch": 2.0052631578947366, "grad_norm": 0.0009561034385114908, "learning_rate": 1e-06, "loss": 0.001, "step": 762 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3024158477783203, "epoch": 2.0078947368421054, "grad_norm": 0.0017013449687510729, "learning_rate": 1e-06, "loss": -0.0003, "step": 763 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3098195642232895, "epoch": 2.0105263157894737, "grad_norm": 0.0036314663011580706, "learning_rate": 1e-06, "loss": 0.0001, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 355.013671875, "completions/mean_terminated_length": 355.013671875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3051309287548065, "epoch": 2.013157894736842, "frac_reward_zero_std": 0.5625, "grad_norm": 0.005320638883858919, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 190129391.0, "reward": 0.657177746295929, "reward_std": 0.08853315562009811, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.654296875, "rewards/symbolic_reward_accuracy/std": 0.4760620892047882, "rewards/symbolic_reward_partial_score/mean": 0.8819987177848816, "rewards/symbolic_reward_partial_score/std": 0.18076345324516296, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077763557434082, "sampling/importance_sampling_ratio/min": 2.4497896447428502e-05, "sampling/sampling_logp_difference/max": 10.616923332214355, "sampling/sampling_logp_difference/mean": 0.15018996596336365, "step": 765 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.31671372056007385, "epoch": 2.0157894736842104, "grad_norm": 0.00803243275731802, "learning_rate": 1e-06, "loss": 0.0018, "step": 766 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3143162131309509, "epoch": 2.018421052631579, "grad_norm": 0.003088021883741021, "learning_rate": 1e-06, "loss": -0.0026, "step": 767 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3092113137245178, "epoch": 2.0210526315789474, "grad_norm": 0.0034267231822013855, "learning_rate": 1e-06, "loss": 0.0006, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 353.95703125, "completions/mean_terminated_length": 353.95703125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3089481443166733, "epoch": 2.0236842105263158, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008214015513658524, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 190701913.0, "reward": 0.6354489326477051, "reward_std": 0.10014986246824265, "rewards/progression_diversity/mean": -3.783680949709378e-05, "rewards/progression_diversity/std": 0.0008561493013985455, "rewards/symbolic_reward_accuracy/mean": 0.630859375, "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, "rewards/symbolic_reward_partial_score/mean": 0.8564453125, "rewards/symbolic_reward_partial_score/std": 0.20011062920093536, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768274068832397, "sampling/importance_sampling_ratio/min": 0.000324864435242489, "sampling/sampling_logp_difference/max": 8.032102584838867, "sampling/sampling_logp_difference/mean": 0.14762979745864868, "step": 769 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.31103044748306274, "epoch": 2.026315789473684, "grad_norm": 0.005561685189604759, "learning_rate": 1e-06, "loss": -0.001, "step": 770 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.3203125, "entropy": 0.30797044932842255, "epoch": 2.028947368421053, "grad_norm": 0.008373531512916088, "learning_rate": 1e-06, "loss": -0.0009, "step": 771 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3142692297697067, "epoch": 2.031578947368421, "grad_norm": 0.009180448018014431, "learning_rate": 1e-06, "loss": 0.0029, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 364.009765625, "completions/mean_terminated_length": 364.009765625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "entropy": 0.3065429776906967, "epoch": 2.0342105263157895, "frac_reward_zero_std": 0.53125, "grad_norm": 0.006964081432670355, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 191298590.0, "reward": 0.6746094226837158, "reward_std": 0.09266329556703568, "rewards/progression_diversity/mean": -1.0304419220119598e-06, "rewards/progression_diversity/std": 2.331623727513943e-05, "rewards/symbolic_reward_accuracy/mean": 0.681640625, "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, "rewards/symbolic_reward_partial_score/mean": 0.8854166865348816, "rewards/symbolic_reward_partial_score/std": 0.1944405734539032, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0783308744430542, "sampling/importance_sampling_ratio/min": 0.0001869058469310403, "sampling/sampling_logp_difference/max": 8.584905624389648, "sampling/sampling_logp_difference/mean": 0.14911575615406036, "step": 773 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30941206216812134, "epoch": 2.036842105263158, "grad_norm": 0.008871463127434254, "learning_rate": 1e-06, "loss": -0.0001, "step": 774 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3097645044326782, "epoch": 2.039473684210526, "grad_norm": 0.006418595090508461, "learning_rate": 1e-06, "loss": 0.0018, "step": 775 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31205175817012787, "epoch": 2.042105263157895, "grad_norm": 0.005364975426346064, "learning_rate": 1e-06, "loss": -0.0018, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 367.24609375, "completions/mean_terminated_length": 367.24609375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.31110282242298126, "epoch": 2.044736842105263, "frac_reward_zero_std": 0.5625, "grad_norm": 0.009222902357578278, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 191865724.0, "reward": 0.6914538741111755, "reward_std": 0.0986681878566742, "rewards/progression_diversity/mean": -0.00012244281242601573, "rewards/progression_diversity/std": 0.002770564751699567, "rewards/symbolic_reward_accuracy/mean": 0.69921875, "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, "rewards/symbolic_reward_partial_score/mean": 0.9064127206802368, "rewards/symbolic_reward_partial_score/std": 0.16166870296001434, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077000379562378, "sampling/importance_sampling_ratio/min": 0.00012576297740451992, "sampling/sampling_logp_difference/max": 8.981111526489258, "sampling/sampling_logp_difference/mean": 0.14813096821308136, "step": 777 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3072364181280136, "epoch": 2.0473684210526315, "grad_norm": 0.004725575912743807, "learning_rate": 1e-06, "loss": -0.0017, "step": 778 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3081643432378769, "epoch": 2.05, "grad_norm": 0.005970868747681379, "learning_rate": 1e-06, "loss": 0.002, "step": 779 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3126187026500702, "epoch": 2.0526315789473686, "grad_norm": 0.0036161758471280336, "learning_rate": 1e-06, "loss": -0.0015, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 366.17578125, "completions/mean_terminated_length": 366.17578125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.31414252519607544, "epoch": 2.055263157894737, "frac_reward_zero_std": 0.53125, "grad_norm": 0.007239960134029388, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 192478710.0, "reward": 0.7059570550918579, "reward_std": 0.1260911226272583, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.71875, "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, "rewards/symbolic_reward_partial_score/mean": 0.9163411259651184, "rewards/symbolic_reward_partial_score/std": 0.14764182269573212, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782256126403809, "sampling/importance_sampling_ratio/min": 0.0007079911883920431, "sampling/sampling_logp_difference/max": 7.253078937530518, "sampling/sampling_logp_difference/mean": 0.14842364192008972, "step": 781 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3148372173309326, "epoch": 2.057894736842105, "grad_norm": 0.005920483730733395, "learning_rate": 1e-06, "loss": -0.0006, "step": 782 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3127191811800003, "epoch": 2.0605263157894735, "grad_norm": 0.009441790170967579, "learning_rate": 1e-06, "loss": 0.001, "step": 783 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.313884437084198, "epoch": 2.0631578947368423, "grad_norm": 0.004328244365751743, "learning_rate": 1e-06, "loss": -0.0007, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 395.302734375, "completions/mean_terminated_length": 364.0137023925781, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.30200617015361786, "epoch": 2.0657894736842106, "frac_reward_zero_std": 0.53125, "grad_norm": 0.008038647472858429, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 193108273.0, "reward": 0.7076036334037781, "reward_std": 0.08745211362838745, "rewards/progression_diversity/mean": -0.0013568074209615588, "rewards/progression_diversity/std": 0.030187834054231644, "rewards/symbolic_reward_accuracy/mean": 0.734375, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.890625, "rewards/symbolic_reward_partial_score/std": 0.1928967982530594, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0760550498962402, "sampling/importance_sampling_ratio/min": 0.0006800789851695299, "sampling/sampling_logp_difference/max": 7.293301582336426, "sampling/sampling_logp_difference/mean": 0.14417186379432678, "step": 785 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.30903005599975586, "epoch": 2.068421052631579, "grad_norm": 0.00587112782523036, "learning_rate": 1e-06, "loss": 0.0031, "step": 786 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3140885829925537, "epoch": 2.0710526315789473, "grad_norm": 0.005677395034581423, "learning_rate": 1e-06, "loss": -0.0009, "step": 787 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31708140671253204, "epoch": 2.0736842105263156, "grad_norm": 0.0033428401220589876, "learning_rate": 1e-06, "loss": 0.0009, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 360.97265625, "completions/mean_terminated_length": 360.97265625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.31590431928634644, "epoch": 2.0763157894736843, "frac_reward_zero_std": 0.46875, "grad_norm": 0.007495602127164602, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 193679875.0, "reward": 0.6812500357627869, "reward_std": 0.1275385469198227, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.693359375, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.8841146230697632, "rewards/symbolic_reward_partial_score/std": 0.1895521730184555, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0787707567214966, "sampling/importance_sampling_ratio/min": 1.7228163642357686e-06, "sampling/sampling_logp_difference/max": 13.271550178527832, "sampling/sampling_logp_difference/mean": 0.14790306985378265, "step": 789 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.3198508620262146, "epoch": 2.0789473684210527, "grad_norm": 0.007427311968058348, "learning_rate": 1e-06, "loss": -0.0006, "step": 790 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3098384737968445, "epoch": 2.081578947368421, "grad_norm": 0.004864763468503952, "learning_rate": 1e-06, "loss": 0.0011, "step": 791 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1328125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.30552271008491516, "epoch": 2.0842105263157893, "grad_norm": 0.004870227538049221, "learning_rate": 1e-06, "loss": -0.0005, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 361.841796875, "completions/mean_terminated_length": 361.841796875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.31432493031024933, "epoch": 2.086842105263158, "frac_reward_zero_std": 0.625, "grad_norm": 0.008887571282684803, "learning_rate": 1e-06, "loss": -0.0032, "num_tokens": 194278290.0, "reward": 0.6641601324081421, "reward_std": 0.06735213100910187, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.669921875, "rewards/symbolic_reward_accuracy/std": 0.47070086002349854, "rewards/symbolic_reward_partial_score/mean": 0.8740234375, "rewards/symbolic_reward_partial_score/std": 0.20001934468746185, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0777101516723633, "sampling/importance_sampling_ratio/min": 0.006325147580355406, "sampling/sampling_logp_difference/max": 5.0632219314575195, "sampling/sampling_logp_difference/mean": 0.14742319285869598, "step": 793 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.30889126658439636, "epoch": 2.0894736842105264, "grad_norm": 0.0032396079041063786, "learning_rate": 1e-06, "loss": 0.0009, "step": 794 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3082197904586792, "epoch": 2.0921052631578947, "grad_norm": 0.0014065575087442994, "learning_rate": 1e-06, "loss": 0.0012, "step": 795 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3115806132555008, "epoch": 2.094736842105263, "grad_norm": 0.007706951815634966, "learning_rate": 1e-06, "loss": 0.0007, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 359.45703125, "completions/mean_terminated_length": 359.45703125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3137265592813492, "epoch": 2.0973684210526318, "frac_reward_zero_std": 0.59375, "grad_norm": 0.004259579814970493, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 194847612.0, "reward": 0.7145506143569946, "reward_std": 0.08891631662845612, "rewards/progression_diversity/mean": -1.562127363285981e-05, "rewards/progression_diversity/std": 0.00035346910590305924, "rewards/symbolic_reward_accuracy/mean": 0.734375, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.9130859375, "rewards/symbolic_reward_partial_score/std": 0.1638316661119461, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0777618885040283, "sampling/importance_sampling_ratio/min": 0.000564085494261235, "sampling/sampling_logp_difference/max": 7.480304718017578, "sampling/sampling_logp_difference/mean": 0.1483234167098999, "step": 797 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3083493560552597, "epoch": 2.1, "grad_norm": 0.00659141456708312, "learning_rate": 1e-06, "loss": -0.0018, "step": 798 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.314989909529686, "epoch": 2.1026315789473684, "grad_norm": 0.0037467950023710728, "learning_rate": 1e-06, "loss": 0.0022, "step": 799 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3185144364833832, "epoch": 2.1052631578947367, "grad_norm": 0.005139374174177647, "learning_rate": 1e-06, "loss": -0.001, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 365.982421875, "completions/mean_terminated_length": 365.982421875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.31436602771282196, "epoch": 2.1078947368421055, "frac_reward_zero_std": 0.625, "grad_norm": 0.008306819945573807, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 195432787.0, "reward": 0.7186523675918579, "reward_std": 0.08997929096221924, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.740234375, "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, "rewards/symbolic_reward_partial_score/mean": 0.9150390625, "rewards/symbolic_reward_partial_score/std": 0.1594066470861435, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0780589580535889, "sampling/importance_sampling_ratio/min": 0.0019471251871436834, "sampling/sampling_logp_difference/max": 6.241401195526123, "sampling/sampling_logp_difference/mean": 0.15034829080104828, "step": 801 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3124569356441498, "epoch": 2.110526315789474, "grad_norm": 0.007252343464642763, "learning_rate": 1e-06, "loss": 0.0016, "step": 802 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31769220530986786, "epoch": 2.113157894736842, "grad_norm": 0.002544126473367214, "learning_rate": 1e-06, "loss": -0.0005, "step": 803 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3188740909099579, "epoch": 2.1157894736842104, "grad_norm": 0.0020408392883837223, "learning_rate": 1e-06, "loss": -0.0006, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 761.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 355.298828125, "completions/mean_terminated_length": 355.298828125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.31769895553588867, "epoch": 2.1184210526315788, "frac_reward_zero_std": 0.53125, "grad_norm": 0.0065668681636452675, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 196035532.0, "reward": 0.694287121295929, "reward_std": 0.09877711534500122, "rewards/progression_diversity/mean": -1.5697775097578415e-06, "rewards/progression_diversity/std": 3.552000998752192e-05, "rewards/symbolic_reward_accuracy/mean": 0.70703125, "rewards/symbolic_reward_accuracy/std": 0.455569326877594, "rewards/symbolic_reward_partial_score/mean": 0.9002279043197632, "rewards/symbolic_reward_partial_score/std": 0.16993996500968933, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0784680843353271, "sampling/importance_sampling_ratio/min": 1.892145405690826e-06, "sampling/sampling_logp_difference/max": 13.177799224853516, "sampling/sampling_logp_difference/mean": 0.15134167671203613, "step": 805 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.31717509031295776, "epoch": 2.1210526315789475, "grad_norm": 0.008248434402048588, "learning_rate": 1e-06, "loss": -0.0006, "step": 806 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3211695998907089, "epoch": 2.123684210526316, "grad_norm": 0.003757715690881014, "learning_rate": 1e-06, "loss": 0.0007, "step": 807 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.32018236815929413, "epoch": 2.126315789473684, "grad_norm": 0.0037706305738538504, "learning_rate": 1e-06, "loss": -0.0013, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 356.197265625, "completions/mean_terminated_length": 356.197265625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.31526923179626465, "epoch": 2.1289473684210525, "frac_reward_zero_std": 0.5, "grad_norm": 0.007752216421067715, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 196643409.0, "reward": 0.6648437976837158, "reward_std": 0.09060361981391907, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.66796875, "rewards/symbolic_reward_accuracy/std": 0.47140273451805115, "rewards/symbolic_reward_partial_score/mean": 0.8802083134651184, "rewards/symbolic_reward_partial_score/std": 0.20505410432815552, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078608512878418, "sampling/importance_sampling_ratio/min": 0.0015027325134724379, "sampling/sampling_logp_difference/max": 6.500470161437988, "sampling/sampling_logp_difference/mean": 0.15123139321804047, "step": 809 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3221448212862015, "epoch": 2.1315789473684212, "grad_norm": 0.006944730877876282, "learning_rate": 1e-06, "loss": -0.0017, "step": 810 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3182772845029831, "epoch": 2.1342105263157896, "grad_norm": 0.009241136722266674, "learning_rate": 1e-06, "loss": 0.002, "step": 811 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3173188716173172, "epoch": 2.136842105263158, "grad_norm": 0.005788304843008518, "learning_rate": 1e-06, "loss": 0.0001, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 693.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 353.5078125, "completions/mean_terminated_length": 353.5078125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "entropy": 0.3186788260936737, "epoch": 2.139473684210526, "frac_reward_zero_std": 0.625, "grad_norm": 0.005239107180386782, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 197251413.0, "reward": 0.6026840209960938, "reward_std": 0.0501655638217926, "rewards/progression_diversity/mean": -0.00015046147746033967, "rewards/progression_diversity/std": 0.002405304927378893, "rewards/symbolic_reward_accuracy/mean": 0.578125, "rewards/symbolic_reward_accuracy/std": 0.49434176087379456, "rewards/symbolic_reward_partial_score/mean": 0.8527017831802368, "rewards/symbolic_reward_partial_score/std": 0.20470060408115387, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0792396068572998, "sampling/importance_sampling_ratio/min": 1.5721030877102748e-06, "sampling/sampling_logp_difference/max": 13.363096237182617, "sampling/sampling_logp_difference/mean": 0.1521165668964386, "step": 813 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3178333342075348, "epoch": 2.1421052631578945, "grad_norm": 0.01202054787427187, "learning_rate": 1e-06, "loss": 0.0033, "step": 814 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3216595947742462, "epoch": 2.1447368421052633, "grad_norm": 0.003936625551432371, "learning_rate": 1e-06, "loss": -0.0019, "step": 815 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.32149362564086914, "epoch": 2.1473684210526316, "grad_norm": 0.002031634794548154, "learning_rate": 1e-06, "loss": 0.0001, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 351.484375, "completions/mean_terminated_length": 351.484375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.31737402081489563, "epoch": 2.15, "frac_reward_zero_std": 0.53125, "grad_norm": 0.004952269606292248, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 197850701.0, "reward": 0.6548340320587158, "reward_std": 0.07701730728149414, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.6484375, "rewards/symbolic_reward_accuracy/std": 0.4779251217842102, "rewards/symbolic_reward_partial_score/mean": 0.8859049081802368, "rewards/symbolic_reward_partial_score/std": 0.17860201001167297, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0793912410736084, "sampling/importance_sampling_ratio/min": 0.0023158956319093704, "sampling/sampling_logp_difference/max": 6.067958831787109, "sampling/sampling_logp_difference/mean": 0.15115895867347717, "step": 817 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.31825047731399536, "epoch": 2.1526315789473682, "grad_norm": 0.007298370823264122, "learning_rate": 1e-06, "loss": 0.0026, "step": 818 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2734375, "entropy": 0.3138178437948227, "epoch": 2.155263157894737, "grad_norm": 0.002253433922305703, "learning_rate": 1e-06, "loss": 0.0004, "step": 819 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.32211922109127045, "epoch": 2.1578947368421053, "grad_norm": 0.010363497771322727, "learning_rate": 1e-06, "loss": -0.0023, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 383.619140625, "completions/mean_terminated_length": 352.3072509765625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.3100568354129791, "epoch": 2.1605263157894736, "frac_reward_zero_std": 0.53125, "grad_norm": 0.006256972439587116, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 198457098.0, "reward": 0.6989597082138062, "reward_std": 0.09930028021335602, "rewards/progression_diversity/mean": -0.0014896361390128732, "rewards/progression_diversity/std": 0.0337008498609066, "rewards/symbolic_reward_accuracy/mean": 0.7109375, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.9080404043197632, "rewards/symbolic_reward_partial_score/std": 0.16075260937213898, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0769511461257935, "sampling/importance_sampling_ratio/min": 0.00047645941958762705, "sampling/sampling_logp_difference/max": 7.649127960205078, "sampling/sampling_logp_difference/mean": 0.14793072640895844, "step": 821 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.32065755128860474, "epoch": 2.163157894736842, "grad_norm": 0.005824042018502951, "learning_rate": 1e-06, "loss": 0.0012, "step": 822 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.3172464370727539, "epoch": 2.1657894736842107, "grad_norm": 0.00658250181004405, "learning_rate": 1e-06, "loss": 0.0021, "step": 823 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.31654980778694153, "epoch": 2.168421052631579, "grad_norm": 0.008792352862656116, "learning_rate": 1e-06, "loss": -0.0005, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 363.064453125, "completions/mean_terminated_length": 363.064453125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "entropy": 0.3157438635826111, "epoch": 2.1710526315789473, "frac_reward_zero_std": 0.40625, "grad_norm": 0.008407440036535263, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 199052811.0, "reward": 0.6742187738418579, "reward_std": 0.13188248872756958, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.681640625, "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, "rewards/symbolic_reward_partial_score/mean": 0.8841145634651184, "rewards/symbolic_reward_partial_score/std": 0.19204509258270264, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0799140930175781, "sampling/importance_sampling_ratio/min": 8.337887993548065e-05, "sampling/sampling_logp_difference/max": 9.392115592956543, "sampling/sampling_logp_difference/mean": 0.15036407113075256, "step": 825 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31925569474697113, "epoch": 2.1736842105263157, "grad_norm": 0.009222878143191338, "learning_rate": 1e-06, "loss": 0.0019, "step": 826 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.31960971653461456, "epoch": 2.1763157894736844, "grad_norm": 0.00924774631857872, "learning_rate": 1e-06, "loss": 0.0019, "step": 827 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3173024207353592, "epoch": 2.1789473684210527, "grad_norm": 0.005093837156891823, "learning_rate": 1e-06, "loss": -0.0008, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 358.44921875, "completions/mean_terminated_length": 358.44921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.30984506011009216, "epoch": 2.181578947368421, "frac_reward_zero_std": 0.625, "grad_norm": 0.005993766710162163, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 199616785.0, "reward": 0.6996093988418579, "reward_std": 0.09051661938428879, "rewards/progression_diversity/mean": -9.057112038135529e-07, "rewards/progression_diversity/std": 2.0493906049523503e-05, "rewards/symbolic_reward_accuracy/mean": 0.716796875, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.8984375, "rewards/symbolic_reward_partial_score/std": 0.17569713294506073, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0780447721481323, "sampling/importance_sampling_ratio/min": 1.4085967450228054e-06, "sampling/sampling_logp_difference/max": 13.472916603088379, "sampling/sampling_logp_difference/mean": 0.15053117275238037, "step": 829 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3208106756210327, "epoch": 2.1842105263157894, "grad_norm": 0.004268263466656208, "learning_rate": 1e-06, "loss": 0.0, "step": 830 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3216129243373871, "epoch": 2.1868421052631577, "grad_norm": 0.005367336794734001, "learning_rate": 1e-06, "loss": 0.0007, "step": 831 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3205704092979431, "epoch": 2.1894736842105265, "grad_norm": 0.006977500859647989, "learning_rate": 1e-06, "loss": -0.0002, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 358.544921875, "completions/mean_terminated_length": 358.544921875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.3062279671430588, "epoch": 2.192105263157895, "frac_reward_zero_std": 0.46875, "grad_norm": 0.00700410595163703, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 200194984.0, "reward": 0.71435546875, "reward_std": 0.13384464383125305, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.74609375, "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, "rewards/symbolic_reward_partial_score/mean": 0.8889974355697632, "rewards/symbolic_reward_partial_score/std": 0.21046864986419678, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0783202648162842, "sampling/importance_sampling_ratio/min": 6.207470869412646e-05, "sampling/sampling_logp_difference/max": 9.687171936035156, "sampling/sampling_logp_difference/mean": 0.14857394993305206, "step": 833 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.31446513533592224, "epoch": 2.194736842105263, "grad_norm": 0.0064468346536159515, "learning_rate": 1e-06, "loss": -0.0011, "step": 834 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3156389594078064, "epoch": 2.1973684210526314, "grad_norm": 0.0036858366802334785, "learning_rate": 1e-06, "loss": -0.0008, "step": 835 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2421875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.296875, "entropy": 0.3129158616065979, "epoch": 2.2, "grad_norm": 0.008718104101717472, "learning_rate": 1e-06, "loss": 0.0028, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 348.08203125, "completions/mean_terminated_length": 348.08203125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.31272754073143005, "epoch": 2.2026315789473685, "frac_reward_zero_std": 0.625, "grad_norm": 0.003958904184401035, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 200761490.0, "reward": 0.73486328125, "reward_std": 0.0685553327202797, "rewards/progression_diversity/mean": -4.245833224558737e-06, "rewards/progression_diversity/std": 9.607223910279572e-05, "rewards/symbolic_reward_accuracy/mean": 0.76953125, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.9104818105697632, "rewards/symbolic_reward_partial_score/std": 0.17350530624389648, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0795412063598633, "sampling/importance_sampling_ratio/min": 0.0001628218888072297, "sampling/sampling_logp_difference/max": 8.722853660583496, "sampling/sampling_logp_difference/mean": 0.1502777636051178, "step": 837 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3207908719778061, "epoch": 2.205263157894737, "grad_norm": 0.0027855695225298405, "learning_rate": 1e-06, "loss": -0.0001, "step": 838 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.32380297780036926, "epoch": 2.207894736842105, "grad_norm": 0.001798739074729383, "learning_rate": 1e-06, "loss": -0.001, "step": 839 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3216434121131897, "epoch": 2.2105263157894735, "grad_norm": 0.0038593984209001064, "learning_rate": 1e-06, "loss": 0.0015, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 352.498046875, "completions/mean_terminated_length": 352.498046875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.321834996342659, "epoch": 2.213157894736842, "frac_reward_zero_std": 0.65625, "grad_norm": 0.011081775650382042, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 201336753.0, "reward": 0.712841808795929, "reward_std": 0.07380041480064392, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.736328125, "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, "rewards/symbolic_reward_partial_score/mean": 0.9034830331802368, "rewards/symbolic_reward_partial_score/std": 0.194377139210701, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0791889429092407, "sampling/importance_sampling_ratio/min": 0.00019401576719246805, "sampling/sampling_logp_difference/max": 8.547571182250977, "sampling/sampling_logp_difference/mean": 0.15164169669151306, "step": 841 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.3151762932538986, "epoch": 2.2157894736842105, "grad_norm": 0.0049751391634345055, "learning_rate": 1e-06, "loss": -0.0003, "step": 842 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.32177719473838806, "epoch": 2.218421052631579, "grad_norm": 0.0013347979402169585, "learning_rate": 1e-06, "loss": 0.001, "step": 843 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3246469497680664, "epoch": 2.221052631578947, "grad_norm": 0.0021147457882761955, "learning_rate": 1e-06, "loss": 0.0002, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "entropy": 0.3158084601163864, "epoch": 2.223684210526316, "frac_reward_zero_std": 0.75, "grad_norm": 0.00637836754322052, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 201900305.0, "reward": 0.7346680164337158, "reward_std": 0.060360029339790344, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.765625, "rewards/symbolic_reward_accuracy/std": 0.42402184009552, "rewards/symbolic_reward_partial_score/mean": 0.9176432490348816, "rewards/symbolic_reward_partial_score/std": 0.16336961090564728, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.078678011894226, "sampling/importance_sampling_ratio/min": 2.9706257009820547e-06, "sampling/sampling_logp_difference/max": 12.726737976074219, "sampling/sampling_logp_difference/mean": 0.1496877670288086, "step": 845 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3176430016756058, "epoch": 2.2263157894736842, "grad_norm": 0.004808385390788317, "learning_rate": 1e-06, "loss": 0.0016, "step": 846 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.31751298904418945, "epoch": 2.2289473684210526, "grad_norm": 0.0020348995458334684, "learning_rate": 1e-06, "loss": -0.0012, "step": 847 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.31586088240146637, "epoch": 2.231578947368421, "grad_norm": 0.004024218767881393, "learning_rate": 1e-06, "loss": -0.0001, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 352.365234375, "completions/mean_terminated_length": 352.365234375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.3219304084777832, "epoch": 2.2342105263157896, "frac_reward_zero_std": 0.4375, "grad_norm": 0.00704569835215807, "learning_rate": 1e-06, "loss": 0.0028, "num_tokens": 202484204.0, "reward": 0.6749019026756287, "reward_std": 0.10070345550775528, "rewards/progression_diversity/mean": -4.508948768489063e-05, "rewards/progression_diversity/std": 0.0006228564889170229, "rewards/symbolic_reward_accuracy/mean": 0.685546875, "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, "rewards/symbolic_reward_partial_score/mean": 0.8785807490348816, "rewards/symbolic_reward_partial_score/std": 0.20275656878948212, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0787829160690308, "sampling/importance_sampling_ratio/min": 1.0576643944659736e-05, "sampling/sampling_logp_difference/max": 11.456862449645996, "sampling/sampling_logp_difference/mean": 0.15037262439727783, "step": 849 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.31789323687553406, "epoch": 2.236842105263158, "grad_norm": 0.008588818833231926, "learning_rate": 1e-06, "loss": -0.0007, "step": 850 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.31510041654109955, "epoch": 2.2394736842105263, "grad_norm": 0.00736332219094038, "learning_rate": 1e-06, "loss": -0.002, "step": 851 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.3124866932630539, "epoch": 2.2421052631578946, "grad_norm": 0.005612007807940245, "learning_rate": 1e-06, "loss": 0.0009, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 361.51953125, "completions/mean_terminated_length": 361.51953125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "entropy": 0.32218506932258606, "epoch": 2.2447368421052634, "frac_reward_zero_std": 0.59375, "grad_norm": 0.013904748484492302, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 203052758.0, "reward": 0.6032225489616394, "reward_std": 0.08952237665653229, "rewards/progression_diversity/mean": -1.4995790479588322e-05, "rewards/progression_diversity/std": 0.000339316000463441, "rewards/symbolic_reward_accuracy/mean": 0.587890625, "rewards/symbolic_reward_accuracy/std": 0.49269601702690125, "rewards/symbolic_reward_partial_score/mean": 0.8349609375, "rewards/symbolic_reward_partial_score/std": 0.21583224833011627, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778234004974365, "sampling/importance_sampling_ratio/min": 5.763072863373964e-07, "sampling/sampling_logp_difference/max": 14.36662483215332, "sampling/sampling_logp_difference/mean": 0.15012940764427185, "step": 853 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3153466731309891, "epoch": 2.2473684210526317, "grad_norm": 0.0023524423595517874, "learning_rate": 1e-06, "loss": 0.0007, "step": 854 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.31378166377544403, "epoch": 2.25, "grad_norm": 0.0045723277144134045, "learning_rate": 1e-06, "loss": -0.0003, "step": 855 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.31204673647880554, "epoch": 2.2526315789473683, "grad_norm": 0.006419648882001638, "learning_rate": 1e-06, "loss": 0.0005, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 364.69921875, "completions/mean_terminated_length": 364.69921875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.3136000484228134, "epoch": 2.2552631578947366, "frac_reward_zero_std": 0.59375, "grad_norm": 0.00942289736121893, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 203652796.0, "reward": 0.6517090201377869, "reward_std": 0.07550329715013504, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.64453125, "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, "rewards/symbolic_reward_partial_score/mean": 0.88330078125, "rewards/symbolic_reward_partial_score/std": 0.18055777251720428, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0787506103515625, "sampling/importance_sampling_ratio/min": 5.0096670747734606e-05, "sampling/sampling_logp_difference/max": 9.901556015014648, "sampling/sampling_logp_difference/mean": 0.1502964198589325, "step": 857 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3159327059984207, "epoch": 2.2578947368421054, "grad_norm": 0.0035984772257506847, "learning_rate": 1e-06, "loss": 0.001, "step": 858 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3201962113380432, "epoch": 2.2605263157894737, "grad_norm": 0.002598309423774481, "learning_rate": 1e-06, "loss": -0.0014, "step": 859 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3207338899374008, "epoch": 2.263157894736842, "grad_norm": 0.0067030079662799835, "learning_rate": 1e-06, "loss": 0.0022, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 362.994140625, "completions/mean_terminated_length": 362.994140625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.31222720444202423, "epoch": 2.2657894736842104, "frac_reward_zero_std": 0.65625, "grad_norm": 0.007842420600354671, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 204248633.0, "reward": 0.675830066204071, "reward_std": 0.05877920240163803, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.6796875, "rewards/symbolic_reward_accuracy/std": 0.4670529365539551, "rewards/symbolic_reward_partial_score/mean": 0.8933919668197632, "rewards/symbolic_reward_partial_score/std": 0.17396998405456543, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0796515941619873, "sampling/importance_sampling_ratio/min": 0.00020882970420643687, "sampling/sampling_logp_difference/max": 8.473991394042969, "sampling/sampling_logp_difference/mean": 0.14983773231506348, "step": 861 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.31324464082717896, "epoch": 2.268421052631579, "grad_norm": 0.008384945802390575, "learning_rate": 1e-06, "loss": -0.0017, "step": 862 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31786786019802094, "epoch": 2.2710526315789474, "grad_norm": 0.002380169229581952, "learning_rate": 1e-06, "loss": 0.0025, "step": 863 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.31832440197467804, "epoch": 2.2736842105263158, "grad_norm": 0.004632322117686272, "learning_rate": 1e-06, "loss": -0.0003, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 369.892578125, "completions/mean_terminated_length": 369.892578125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "entropy": 0.3140920400619507, "epoch": 2.276315789473684, "frac_reward_zero_std": 0.5, "grad_norm": 0.005593942478299141, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 204799298.0, "reward": 0.6139160394668579, "reward_std": 0.10376207530498505, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.58984375, "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, "rewards/symbolic_reward_partial_score/mean": 0.86669921875, "rewards/symbolic_reward_partial_score/std": 0.19208206236362457, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0795059204101562, "sampling/importance_sampling_ratio/min": 0.0054131243377923965, "sampling/sampling_logp_difference/max": 5.218928813934326, "sampling/sampling_logp_difference/mean": 0.14964452385902405, "step": 865 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31835804879665375, "epoch": 2.2789473684210524, "grad_norm": 0.0054872226901352406, "learning_rate": 1e-06, "loss": 0.0006, "step": 866 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.31307150423526764, "epoch": 2.281578947368421, "grad_norm": 0.008840472437441349, "learning_rate": 1e-06, "loss": 0.0003, "step": 867 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.31860506534576416, "epoch": 2.2842105263157895, "grad_norm": 0.003689047647640109, "learning_rate": 1e-06, "loss": -0.0004, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 400.73046875, "completions/mean_terminated_length": 369.4520568847656, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.3164720982313156, "epoch": 2.286842105263158, "frac_reward_zero_std": 0.59375, "grad_norm": 0.00662199268117547, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 205400760.0, "reward": 0.7893416881561279, "reward_std": 0.0898476168513298, "rewards/progression_diversity/mean": -0.0013872169656679034, "rewards/progression_diversity/std": 0.031389135867357254, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.9436848759651184, "rewards/symbolic_reward_partial_score/std": 0.14467516541481018, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0766570568084717, "sampling/importance_sampling_ratio/min": 0.00467439740896225, "sampling/sampling_logp_difference/max": 5.365654945373535, "sampling/sampling_logp_difference/mean": 0.14632970094680786, "step": 869 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3118371516466141, "epoch": 2.2894736842105265, "grad_norm": 0.003031267784535885, "learning_rate": 1e-06, "loss": -0.0035, "step": 870 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.30880677700042725, "epoch": 2.292105263157895, "grad_norm": 0.005558013450354338, "learning_rate": 1e-06, "loss": 0.0001, "step": 871 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.3143739849328995, "epoch": 2.294736842105263, "grad_norm": 0.008641637861728668, "learning_rate": 1e-06, "loss": 0.0009, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 399.84765625, "completions/mean_terminated_length": 368.5675048828125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3094482719898224, "epoch": 2.2973684210526315, "frac_reward_zero_std": 0.5625, "grad_norm": 0.008527515456080437, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 206010794.0, "reward": 0.6527195572853088, "reward_std": 0.08901824057102203, "rewards/progression_diversity/mean": -0.001486002467572689, "rewards/progression_diversity/std": 0.03277655318379402, "rewards/symbolic_reward_accuracy/mean": 0.64453125, "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, "rewards/symbolic_reward_partial_score/mean": 0.88671875, "rewards/symbolic_reward_partial_score/std": 0.17725172638893127, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076696753501892, "sampling/importance_sampling_ratio/min": 3.0586368211515946e-06, "sampling/sampling_logp_difference/max": 12.697541236877441, "sampling/sampling_logp_difference/mean": 0.1453593373298645, "step": 873 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.311574786901474, "epoch": 2.3, "grad_norm": 0.004662012215703726, "learning_rate": 1e-06, "loss": 0.0006, "step": 874 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3119223415851593, "epoch": 2.3026315789473686, "grad_norm": 0.005506505724042654, "learning_rate": 1e-06, "loss": -0.0014, "step": 875 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.308918759226799, "epoch": 2.305263157894737, "grad_norm": 0.007730038370937109, "learning_rate": 1e-06, "loss": 0.0186, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 371.2109375, "completions/mean_terminated_length": 371.2109375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.3163398802280426, "epoch": 2.307894736842105, "frac_reward_zero_std": 0.3125, "grad_norm": 0.0073974900878965855, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 206607350.0, "reward": 0.6987792253494263, "reward_std": 0.1293921172618866, "rewards/progression_diversity/mean": -1.3751643564319238e-05, "rewards/progression_diversity/std": 0.0003111641854047775, "rewards/symbolic_reward_accuracy/mean": 0.71875, "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, "rewards/symbolic_reward_partial_score/mean": 0.8917642831802368, "rewards/symbolic_reward_partial_score/std": 0.1961551159620285, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0803439617156982, "sampling/importance_sampling_ratio/min": 0.004385761916637421, "sampling/sampling_logp_difference/max": 5.429391860961914, "sampling/sampling_logp_difference/mean": 0.1513446569442749, "step": 877 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.265625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3198176920413971, "epoch": 2.3105263157894735, "grad_norm": 0.004171326756477356, "learning_rate": 1e-06, "loss": -0.0021, "step": 878 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2265625, "clip_ratio/low_mean": 0.1484375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.375, "entropy": 0.31697770953178406, "epoch": 2.3131578947368423, "grad_norm": 0.01108588743954897, "learning_rate": 1e-06, "loss": 0.0061, "step": 879 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.2578125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.328125, "entropy": 0.3188299834728241, "epoch": 2.3157894736842106, "grad_norm": 0.007227355148643255, "learning_rate": 1e-06, "loss": 0.0004, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 361.14453125, "completions/mean_terminated_length": 361.14453125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3121223598718643, "epoch": 2.318421052631579, "frac_reward_zero_std": 0.65625, "grad_norm": 0.011712749488651752, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 207172704.0, "reward": 0.6546875238418579, "reward_std": 0.07038979977369308, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.6484375, "rewards/symbolic_reward_accuracy/std": 0.4779251217842102, "rewards/symbolic_reward_partial_score/mean": 0.8854166269302368, "rewards/symbolic_reward_partial_score/std": 0.17847900092601776, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0788229703903198, "sampling/importance_sampling_ratio/min": 6.657434278167784e-05, "sampling/sampling_logp_difference/max": 9.617191314697266, "sampling/sampling_logp_difference/mean": 0.15001147985458374, "step": 881 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31704777479171753, "epoch": 2.3210526315789473, "grad_norm": 0.0035065063275396824, "learning_rate": 1e-06, "loss": -0.0018, "step": 882 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3097846359014511, "epoch": 2.3236842105263156, "grad_norm": 0.005223968997597694, "learning_rate": 1e-06, "loss": -0.0001, "step": 883 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3145690858364105, "epoch": 2.3263157894736843, "grad_norm": 0.007196424994617701, "learning_rate": 1e-06, "loss": -0.0002, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 366.859375, "completions/mean_terminated_length": 366.859375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.31969161331653595, "epoch": 2.3289473684210527, "frac_reward_zero_std": 0.65625, "grad_norm": 0.005184574518352747, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 207759832.0, "reward": 0.7319824695587158, "reward_std": 0.10044944286346436, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.755859375, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.92822265625, "rewards/symbolic_reward_partial_score/std": 0.14612270891666412, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0796695947647095, "sampling/importance_sampling_ratio/min": 2.608057229736005e-06, "sampling/sampling_logp_difference/max": 12.856904983520508, "sampling/sampling_logp_difference/mean": 0.15157514810562134, "step": 885 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.31721703708171844, "epoch": 2.331578947368421, "grad_norm": 0.005828100256621838, "learning_rate": 1e-06, "loss": -0.0003, "step": 886 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.31899966299533844, "epoch": 2.3342105263157893, "grad_norm": 0.0027556861750781536, "learning_rate": 1e-06, "loss": 0.0004, "step": 887 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31219226121902466, "epoch": 2.336842105263158, "grad_norm": 0.004852119833230972, "learning_rate": 1e-06, "loss": -0.0011, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 870.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 368.1484375, "completions/mean_terminated_length": 368.1484375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.31981049478054047, "epoch": 2.3394736842105264, "frac_reward_zero_std": 0.6875, "grad_norm": 0.007404921110719442, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 208337956.0, "reward": 0.6898437738418579, "reward_std": 0.07387437671422958, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.703125, "rewards/symbolic_reward_accuracy/std": 0.45732781291007996, "rewards/symbolic_reward_partial_score/mean": 0.8932291269302368, "rewards/symbolic_reward_partial_score/std": 0.190824955701828, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0800843238830566, "sampling/importance_sampling_ratio/min": 5.298529504216276e-05, "sampling/sampling_logp_difference/max": 9.84549617767334, "sampling/sampling_logp_difference/mean": 0.1499912440776825, "step": 889 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3165969252586365, "epoch": 2.3421052631578947, "grad_norm": 0.005117423832416534, "learning_rate": 1e-06, "loss": 0.0007, "step": 890 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.31549713015556335, "epoch": 2.344736842105263, "grad_norm": 0.004304162692278624, "learning_rate": 1e-06, "loss": 0.0003, "step": 891 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.31113120913505554, "epoch": 2.3473684210526318, "grad_norm": 0.009535240940749645, "learning_rate": 1e-06, "loss": -0.0017, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 364.076171875, "completions/mean_terminated_length": 364.076171875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "entropy": 0.31199419498443604, "epoch": 2.35, "frac_reward_zero_std": 0.71875, "grad_norm": 0.006672441493719816, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 208898955.0, "reward": 0.7187011241912842, "reward_std": 0.07201861590147018, "rewards/progression_diversity/mean": -5.167676135897636e-06, "rewards/progression_diversity/std": 0.00011693117266986519, "rewards/symbolic_reward_accuracy/mean": 0.744140625, "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, "rewards/symbolic_reward_partial_score/mean": 0.9073893427848816, "rewards/symbolic_reward_partial_score/std": 0.16758772730827332, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0769355297088623, "sampling/importance_sampling_ratio/min": 8.111090892271022e-07, "sampling/sampling_logp_difference/max": 14.024863243103027, "sampling/sampling_logp_difference/mean": 0.14778931438922882, "step": 893 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3058849573135376, "epoch": 2.3526315789473684, "grad_norm": 0.004412407986819744, "learning_rate": 1e-06, "loss": -0.0006, "step": 894 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.30890876054763794, "epoch": 2.3552631578947367, "grad_norm": 0.0028338998090475798, "learning_rate": 1e-06, "loss": -0.0014, "step": 895 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3096920996904373, "epoch": 2.3578947368421055, "grad_norm": 0.008265499956905842, "learning_rate": 1e-06, "loss": 0.0015, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 364.822265625, "completions/mean_terminated_length": 364.822265625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3139221966266632, "epoch": 2.360526315789474, "frac_reward_zero_std": 0.53125, "grad_norm": 0.005665302742272615, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 209503248.0, "reward": 0.5989741683006287, "reward_std": 0.10015460103750229, "rewards/progression_diversity/mean": -4.6549830585718155e-05, "rewards/progression_diversity/std": 0.001053302432410419, "rewards/symbolic_reward_accuracy/mean": 0.572265625, "rewards/symbolic_reward_accuracy/std": 0.4952339828014374, "rewards/symbolic_reward_partial_score/mean": 0.85205078125, "rewards/symbolic_reward_partial_score/std": 0.20019729435443878, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077416181564331, "sampling/importance_sampling_ratio/min": 0.00034949558903463185, "sampling/sampling_logp_difference/max": 7.959019660949707, "sampling/sampling_logp_difference/mean": 0.1495225429534912, "step": 897 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.31885823607444763, "epoch": 2.363157894736842, "grad_norm": 0.006921331398189068, "learning_rate": 1e-06, "loss": 0.0006, "step": 898 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.30479341745376587, "epoch": 2.3657894736842104, "grad_norm": 0.007010175846517086, "learning_rate": 1e-06, "loss": 0.0008, "step": 899 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3115847110748291, "epoch": 2.3684210526315788, "grad_norm": 0.004461990669369698, "learning_rate": 1e-06, "loss": -0.0011, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 358.376953125, "completions/mean_terminated_length": 358.376953125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "entropy": 0.3171516954898834, "epoch": 2.3710526315789475, "frac_reward_zero_std": 0.625, "grad_norm": 0.007584826089441776, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 210096721.0, "reward": 0.6747558116912842, "reward_std": 0.06001054495573044, "rewards/progression_diversity/mean": -1.2740825695800595e-05, "rewards/progression_diversity/std": 0.0002882919798139483, "rewards/symbolic_reward_accuracy/mean": 0.685546875, "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, "rewards/symbolic_reward_partial_score/mean": 0.8780924081802368, "rewards/symbolic_reward_partial_score/std": 0.2119402289390564, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778324604034424, "sampling/importance_sampling_ratio/min": 1.234491810464533e-05, "sampling/sampling_logp_difference/max": 11.302266120910645, "sampling/sampling_logp_difference/mean": 0.14901325106620789, "step": 901 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3117760568857193, "epoch": 2.373684210526316, "grad_norm": 0.004368501249700785, "learning_rate": 1e-06, "loss": 0.0021, "step": 902 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.30777251720428467, "epoch": 2.376315789473684, "grad_norm": 0.0013312747469171882, "learning_rate": 1e-06, "loss": -0.0025, "step": 903 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31223200261592865, "epoch": 2.3789473684210525, "grad_norm": 0.005191898439079523, "learning_rate": 1e-06, "loss": -0.0008, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 387.095703125, "completions/mean_terminated_length": 355.7906188964844, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.30744390189647675, "epoch": 2.3815789473684212, "frac_reward_zero_std": 0.71875, "grad_norm": 0.008135558106005192, "learning_rate": 1e-06, "loss": 0.029, "num_tokens": 210698722.0, "reward": 0.6670783758163452, "reward_std": 0.05189436301589012, "rewards/progression_diversity/mean": -0.0011522338027134538, "rewards/progression_diversity/std": 0.026072077453136444, "rewards/symbolic_reward_accuracy/mean": 0.6640625, "rewards/symbolic_reward_accuracy/std": 0.4727790653705597, "rewards/symbolic_reward_partial_score/mean": 0.8955078125, "rewards/symbolic_reward_partial_score/std": 0.1667071133852005, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773719549179077, "sampling/importance_sampling_ratio/min": 4.233967047184706e-05, "sampling/sampling_logp_difference/max": 10.069786071777344, "sampling/sampling_logp_difference/mean": 0.1469884216785431, "step": 905 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3112915903329849, "epoch": 2.3842105263157896, "grad_norm": 0.0049289437010884285, "learning_rate": 1e-06, "loss": -0.0012, "step": 906 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.3160187900066376, "epoch": 2.386842105263158, "grad_norm": 0.006295245606452227, "learning_rate": 1e-06, "loss": -0.0017, "step": 907 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.31758415699005127, "epoch": 2.389473684210526, "grad_norm": 0.0024000697303563356, "learning_rate": 1e-06, "loss": 0.0009, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 14656.0, "completions/max_terminated_length": 14656.0, "completions/mean_length": 387.697265625, "completions/mean_terminated_length": 387.697265625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.307947114109993, "epoch": 2.3921052631578945, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0064180451445281506, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 211330567.0, "reward": 0.6816225051879883, "reward_std": 0.10136310756206512, "rewards/progression_diversity/mean": -0.0018144649220630527, "rewards/progression_diversity/std": 0.04105665162205696, "rewards/symbolic_reward_accuracy/mean": 0.685546875, "rewards/symbolic_reward_accuracy/std": 0.4647517800331116, "rewards/symbolic_reward_partial_score/mean": 0.9016927480697632, "rewards/symbolic_reward_partial_score/std": 0.17035500705242157, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0767887830734253, "sampling/importance_sampling_ratio/min": 3.258802098571323e-05, "sampling/sampling_logp_difference/max": 10.331565856933594, "sampling/sampling_logp_difference/mean": 0.14458686113357544, "step": 909 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.31323176622390747, "epoch": 2.3947368421052633, "grad_norm": 0.0073518105782568455, "learning_rate": 1e-06, "loss": 0.0002, "step": 910 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.30697983503341675, "epoch": 2.3973684210526316, "grad_norm": 0.006437649950385094, "learning_rate": 1e-06, "loss": 0.0251, "step": 911 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3090358376502991, "epoch": 2.4, "grad_norm": 0.00426133070141077, "learning_rate": 1e-06, "loss": 0.0005, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 665.0, "completions/max_terminated_length": 665.0, "completions/mean_length": 364.94921875, "completions/mean_terminated_length": 364.94921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.30903904139995575, "epoch": 2.4026315789473682, "frac_reward_zero_std": 0.5625, "grad_norm": 0.007032964378595352, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 211941421.0, "reward": 0.6682119965553284, "reward_std": 0.07401406019926071, "rewards/progression_diversity/mean": -9.134229185292497e-05, "rewards/progression_diversity/std": 0.002066840184852481, "rewards/symbolic_reward_accuracy/mean": 0.681640625, "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, "rewards/symbolic_reward_partial_score/mean": 0.8640950918197632, "rewards/symbolic_reward_partial_score/std": 0.22005455195903778, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07720947265625, "sampling/importance_sampling_ratio/min": 0.0013609235174953938, "sampling/sampling_logp_difference/max": 6.5995917320251465, "sampling/sampling_logp_difference/mean": 0.14810317754745483, "step": 913 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3084554970264435, "epoch": 2.405263157894737, "grad_norm": 0.0028849325608462095, "learning_rate": 1e-06, "loss": -0.0001, "step": 914 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3103426396846771, "epoch": 2.4078947368421053, "grad_norm": 0.005803754087537527, "learning_rate": 1e-06, "loss": 0.0017, "step": 915 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3055001199245453, "epoch": 2.4105263157894736, "grad_norm": 0.0037809647619724274, "learning_rate": 1e-06, "loss": -0.0003, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 709.0, "completions/max_terminated_length": 709.0, "completions/mean_length": 358.556640625, "completions/mean_terminated_length": 358.556640625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3065197467803955, "epoch": 2.413157894736842, "frac_reward_zero_std": 0.53125, "grad_norm": 0.009809279814362526, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 212527306.0, "reward": 0.706250011920929, "reward_std": 0.08705538511276245, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.720703125, "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, "rewards/symbolic_reward_partial_score/mean": 0.9134114980697632, "rewards/symbolic_reward_partial_score/std": 0.15249691903591156, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773168802261353, "sampling/importance_sampling_ratio/min": 0.0006119803292676806, "sampling/sampling_logp_difference/max": 7.398810386657715, "sampling/sampling_logp_difference/mean": 0.14674416184425354, "step": 917 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3042609840631485, "epoch": 2.4157894736842107, "grad_norm": 0.008614557795226574, "learning_rate": 1e-06, "loss": 0.0015, "step": 918 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.3085095137357712, "epoch": 2.418421052631579, "grad_norm": 0.004739607684314251, "learning_rate": 1e-06, "loss": 0.0002, "step": 919 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.30383746325969696, "epoch": 2.4210526315789473, "grad_norm": 0.0061707510612905025, "learning_rate": 1e-06, "loss": -0.0004, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 355.4609375, "completions/mean_terminated_length": 355.4609375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.3117644786834717, "epoch": 2.4236842105263157, "frac_reward_zero_std": 0.625, "grad_norm": 0.008792375214397907, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 213128470.0, "reward": 0.6263182163238525, "reward_std": 0.07272778451442719, "rewards/progression_diversity/mean": -1.6492393115186132e-05, "rewards/progression_diversity/std": 0.0003731802280526608, "rewards/symbolic_reward_accuracy/mean": 0.611328125, "rewards/symbolic_reward_accuracy/std": 0.4879252314567566, "rewards/symbolic_reward_partial_score/mean": 0.8650715947151184, "rewards/symbolic_reward_partial_score/std": 0.18900856375694275, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778659582138062, "sampling/importance_sampling_ratio/min": 0.00012650190910790116, "sampling/sampling_logp_difference/max": 8.975253105163574, "sampling/sampling_logp_difference/mean": 0.14732477068901062, "step": 921 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.3068103492259979, "epoch": 2.4263157894736844, "grad_norm": 0.005262767896056175, "learning_rate": 1e-06, "loss": -0.0003, "step": 922 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3068346828222275, "epoch": 2.4289473684210527, "grad_norm": 0.006233863532543182, "learning_rate": 1e-06, "loss": -0.0001, "step": 923 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.30970314145088196, "epoch": 2.431578947368421, "grad_norm": 0.004028057213872671, "learning_rate": 1e-06, "loss": 0.0004, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 364.669921875, "completions/mean_terminated_length": 364.669921875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.3095943033695221, "epoch": 2.4342105263157894, "frac_reward_zero_std": 0.6875, "grad_norm": 0.009729484096169472, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 213728813.0, "reward": 0.7836426496505737, "reward_std": 0.08076313883066177, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.9363607168197632, "rewards/symbolic_reward_partial_score/std": 0.15720035135746002, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0774470567703247, "sampling/importance_sampling_ratio/min": 7.029231028354843e-07, "sampling/sampling_logp_difference/max": 14.168018341064453, "sampling/sampling_logp_difference/mean": 0.1473565697669983, "step": 925 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3090842068195343, "epoch": 2.4368421052631577, "grad_norm": 0.007715750485658646, "learning_rate": 1e-06, "loss": 0.0001, "step": 926 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3084794133901596, "epoch": 2.4394736842105265, "grad_norm": 0.005060678347945213, "learning_rate": 1e-06, "loss": -0.0017, "step": 927 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3028125911951065, "epoch": 2.442105263157895, "grad_norm": 0.007612347137182951, "learning_rate": 1e-06, "loss": 0.0007, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 355.70703125, "completions/mean_terminated_length": 355.70703125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.31409741938114166, "epoch": 2.444736842105263, "frac_reward_zero_std": 0.75, "grad_norm": 0.0024258929770439863, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 214313399.0, "reward": 0.6506836414337158, "reward_std": 0.04020765423774719, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.634765625, "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, "rewards/symbolic_reward_partial_score/mean": 0.8994140625, "rewards/symbolic_reward_partial_score/std": 0.15110193192958832, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0788359642028809, "sampling/importance_sampling_ratio/min": 0.001144380308687687, "sampling/sampling_logp_difference/max": 6.772891998291016, "sampling/sampling_logp_difference/mean": 0.14749541878700256, "step": 929 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.31147317588329315, "epoch": 2.4473684210526314, "grad_norm": 0.0049695298075675964, "learning_rate": 1e-06, "loss": -0.0002, "step": 930 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.30958548188209534, "epoch": 2.45, "grad_norm": 0.002006982918828726, "learning_rate": 1e-06, "loss": 0.0007, "step": 931 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.30286461114883423, "epoch": 2.4526315789473685, "grad_norm": 0.00774997565895319, "learning_rate": 1e-06, "loss": -0.0002, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 356.365234375, "completions/mean_terminated_length": 356.365234375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.31697480380535126, "epoch": 2.455263157894737, "frac_reward_zero_std": 0.6875, "grad_norm": 0.008618047460913658, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 214924690.0, "reward": 0.6026855707168579, "reward_std": 0.07293908298015594, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.58984375, "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, "rewards/symbolic_reward_partial_score/mean": 0.8292643427848816, "rewards/symbolic_reward_partial_score/std": 0.22252783179283142, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0781829357147217, "sampling/importance_sampling_ratio/min": 0.0011154379462823272, "sampling/sampling_logp_difference/max": 6.798508167266846, "sampling/sampling_logp_difference/mean": 0.14927701652050018, "step": 933 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.30709315836429596, "epoch": 2.457894736842105, "grad_norm": 0.0025286353193223476, "learning_rate": 1e-06, "loss": 0.0001, "step": 934 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.3124760091304779, "epoch": 2.4605263157894735, "grad_norm": 0.006756810937076807, "learning_rate": 1e-06, "loss": -0.0006, "step": 935 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.31513579189777374, "epoch": 2.463157894736842, "grad_norm": 0.0030686540994793177, "learning_rate": 1e-06, "loss": -0.0002, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 354.447265625, "completions/mean_terminated_length": 354.447265625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3113676607608795, "epoch": 2.4657894736842105, "frac_reward_zero_std": 0.46875, "grad_norm": 0.007624417077749968, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 215512823.0, "reward": 0.6218748092651367, "reward_std": 0.11250467598438263, "rewards/progression_diversity/mean": -2.46087020059349e-05, "rewards/progression_diversity/std": 0.0005568313645198941, "rewards/symbolic_reward_accuracy/mean": 0.615234375, "rewards/symbolic_reward_accuracy/std": 0.4870156943798065, "rewards/symbolic_reward_partial_score/mean": 0.8424479365348816, "rewards/symbolic_reward_partial_score/std": 0.2174653708934784, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0776278972625732, "sampling/importance_sampling_ratio/min": 2.9739478577539558e-06, "sampling/sampling_logp_difference/max": 12.72562026977539, "sampling/sampling_logp_difference/mean": 0.14833664894104004, "step": 937 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.31023257970809937, "epoch": 2.468421052631579, "grad_norm": 0.007213308941572905, "learning_rate": 1e-06, "loss": -0.0005, "step": 938 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3112625777721405, "epoch": 2.4710526315789476, "grad_norm": 0.006306259427219629, "learning_rate": 1e-06, "loss": 0.0015, "step": 939 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.3071421980857849, "epoch": 2.473684210526316, "grad_norm": 0.007035477552562952, "learning_rate": 1e-06, "loss": -0.0002, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 350.392578125, "completions/mean_terminated_length": 350.392578125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.3151005804538727, "epoch": 2.4763157894736842, "frac_reward_zero_std": 0.59375, "grad_norm": 0.010886289179325104, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 216085184.0, "reward": 0.7312005758285522, "reward_std": 0.06138945370912552, "rewards/progression_diversity/mean": -6.468767242040485e-05, "rewards/progression_diversity/std": 0.0014637151034548879, "rewards/symbolic_reward_accuracy/mean": 0.759765625, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.9178059697151184, "rewards/symbolic_reward_partial_score/std": 0.16282695531845093, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0770689249038696, "sampling/importance_sampling_ratio/min": 2.975442839669995e-06, "sampling/sampling_logp_difference/max": 12.725117683410645, "sampling/sampling_logp_difference/mean": 0.14930102229118347, "step": 941 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3122284412384033, "epoch": 2.4789473684210526, "grad_norm": 0.00767452223226428, "learning_rate": 1e-06, "loss": -0.0009, "step": 942 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.31038355827331543, "epoch": 2.481578947368421, "grad_norm": 0.0039905463345348835, "learning_rate": 1e-06, "loss": 0.001, "step": 943 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3104354441165924, "epoch": 2.4842105263157896, "grad_norm": 0.006206648889929056, "learning_rate": 1e-06, "loss": -0.0003, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 349.552734375, "completions/mean_terminated_length": 349.552734375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.307844340801239, "epoch": 2.486842105263158, "frac_reward_zero_std": 0.59375, "grad_norm": 0.005118571221828461, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 216655451.0, "reward": 0.6357910633087158, "reward_std": 0.08715735375881195, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.626953125, "rewards/symbolic_reward_accuracy/std": 0.48408737778663635, "rewards/symbolic_reward_partial_score/mean": 0.8653971552848816, "rewards/symbolic_reward_partial_score/std": 0.18656498193740845, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076431155204773, "sampling/importance_sampling_ratio/min": 2.6057361424136616e-07, "sampling/sampling_logp_difference/max": 15.160380363464355, "sampling/sampling_logp_difference/mean": 0.1468237042427063, "step": 945 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30096369981765747, "epoch": 2.4894736842105263, "grad_norm": 0.008310060016810894, "learning_rate": 1e-06, "loss": 0.0045, "step": 946 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3050964027643204, "epoch": 2.4921052631578946, "grad_norm": 0.006288197357207537, "learning_rate": 1e-06, "loss": -0.0023, "step": 947 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.3067568242549896, "epoch": 2.4947368421052634, "grad_norm": 0.008080433122813702, "learning_rate": 1e-06, "loss": -0.0019, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 352.97265625, "completions/mean_terminated_length": 352.97265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.30862340331077576, "epoch": 2.4973684210526317, "frac_reward_zero_std": 0.65625, "grad_norm": 0.002505539683625102, "learning_rate": 1e-06, "loss": 0.0017, "num_tokens": 217229293.0, "reward": 0.7032715082168579, "reward_std": 0.05813654884696007, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.716796875, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.91064453125, "rewards/symbolic_reward_partial_score/std": 0.16002444922924042, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0768301486968994, "sampling/importance_sampling_ratio/min": 0.000333755393512547, "sampling/sampling_logp_difference/max": 8.005102157592773, "sampling/sampling_logp_difference/mean": 0.14894793927669525, "step": 949 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.310753732919693, "epoch": 2.5, "grad_norm": 0.002883767941966653, "learning_rate": 1e-06, "loss": -0.0008, "step": 950 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3066103011369705, "epoch": 2.5026315789473683, "grad_norm": 0.0024179958272725344, "learning_rate": 1e-06, "loss": 0.001, "step": 951 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.3090369999408722, "epoch": 2.5052631578947366, "grad_norm": 0.0026873471215367317, "learning_rate": 1e-06, "loss": -0.001, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 380.767578125, "completions/mean_terminated_length": 349.4501037597656, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.3078317791223526, "epoch": 2.5078947368421054, "frac_reward_zero_std": 0.625, "grad_norm": 0.005132244899868965, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 217815542.0, "reward": 0.7311402559280396, "reward_std": 0.08192440122365952, "rewards/progression_diversity/mean": -0.0012174799339845777, "rewards/progression_diversity/std": 0.026926402002573013, "rewards/symbolic_reward_accuracy/mean": 0.7578125, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.9215494394302368, "rewards/symbolic_reward_partial_score/std": 0.15562941133975983, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0754692554473877, "sampling/importance_sampling_ratio/min": 2.2065607936383458e-06, "sampling/sampling_logp_difference/max": 13.024075508117676, "sampling/sampling_logp_difference/mean": 0.1453796923160553, "step": 953 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.31123657524585724, "epoch": 2.5105263157894737, "grad_norm": 0.003947115037590265, "learning_rate": 1e-06, "loss": -0.0001, "step": 954 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3104862570762634, "epoch": 2.513157894736842, "grad_norm": 0.002622234169393778, "learning_rate": 1e-06, "loss": 0.028, "step": 955 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.30933481454849243, "epoch": 2.515789473684211, "grad_norm": 0.003103440860286355, "learning_rate": 1e-06, "loss": 0.0002, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 344.740234375, "completions/mean_terminated_length": 344.740234375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3079196810722351, "epoch": 2.518421052631579, "frac_reward_zero_std": 0.625, "grad_norm": 0.004070614464581013, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 218374161.0, "reward": 0.7833008170127869, "reward_std": 0.07904037088155746, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.9469400644302368, "rewards/symbolic_reward_partial_score/std": 0.13283653557300568, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0777184963226318, "sampling/importance_sampling_ratio/min": 7.50339386286214e-05, "sampling/sampling_logp_difference/max": 9.497570037841797, "sampling/sampling_logp_difference/mean": 0.14970475435256958, "step": 957 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3124520182609558, "epoch": 2.5210526315789474, "grad_norm": 0.003689341014251113, "learning_rate": 1e-06, "loss": -0.0009, "step": 958 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.3111405521631241, "epoch": 2.5236842105263158, "grad_norm": 0.0028974253218621016, "learning_rate": 1e-06, "loss": 0.0002, "step": 959 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3061821907758713, "epoch": 2.526315789473684, "grad_norm": 0.005521416198462248, "learning_rate": 1e-06, "loss": 0.0004, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 346.318359375, "completions/mean_terminated_length": 346.318359375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.3132731467485428, "epoch": 2.5289473684210524, "frac_reward_zero_std": 0.59375, "grad_norm": 0.004160603508353233, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 218967796.0, "reward": 0.6078125238418579, "reward_std": 0.07761308550834656, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.58984375, "rewards/symbolic_reward_accuracy/std": 0.49234291911125183, "rewards/symbolic_reward_partial_score/mean": 0.8463541269302368, "rewards/symbolic_reward_partial_score/std": 0.21938246488571167, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782113075256348, "sampling/importance_sampling_ratio/min": 0.0008083023130893707, "sampling/sampling_logp_difference/max": 7.120574474334717, "sampling/sampling_logp_difference/mean": 0.14856785535812378, "step": 961 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30516865849494934, "epoch": 2.531578947368421, "grad_norm": 0.006756477057933807, "learning_rate": 1e-06, "loss": -0.0012, "step": 962 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3062412440776825, "epoch": 2.5342105263157895, "grad_norm": 0.0025619908701628447, "learning_rate": 1e-06, "loss": 0.0008, "step": 963 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3087068498134613, "epoch": 2.536842105263158, "grad_norm": 0.0025542299263179302, "learning_rate": 1e-06, "loss": 0.0016, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 348.484375, "completions/mean_terminated_length": 348.484375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.30975644290447235, "epoch": 2.5394736842105265, "frac_reward_zero_std": 0.75, "grad_norm": 0.00557674840092659, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 219543692.0, "reward": 0.653564453125, "reward_std": 0.044423237442970276, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.646484375, "rewards/symbolic_reward_accuracy/std": 0.47852855920791626, "rewards/symbolic_reward_partial_score/mean": 0.8855794668197632, "rewards/symbolic_reward_partial_score/std": 0.1733708679676056, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0785977840423584, "sampling/importance_sampling_ratio/min": 0.0008315914310514927, "sampling/sampling_logp_difference/max": 7.092169284820557, "sampling/sampling_logp_difference/mean": 0.14907319843769073, "step": 965 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3097044676542282, "epoch": 2.542105263157895, "grad_norm": 0.002045287750661373, "learning_rate": 1e-06, "loss": -0.0001, "step": 966 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.31358230113983154, "epoch": 2.544736842105263, "grad_norm": 0.007611148990690708, "learning_rate": 1e-06, "loss": -0.0017, "step": 967 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.31234195828437805, "epoch": 2.5473684210526315, "grad_norm": 0.006483261939138174, "learning_rate": 1e-06, "loss": 0.0017, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 348.541015625, "completions/mean_terminated_length": 348.541015625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.310483381152153, "epoch": 2.55, "frac_reward_zero_std": 0.71875, "grad_norm": 0.003465034533292055, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 220121281.0, "reward": 0.6914063096046448, "reward_std": 0.060588110238313675, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.70703125, "rewards/symbolic_reward_accuracy/std": 0.455569326877594, "rewards/symbolic_reward_partial_score/mean": 0.890625, "rewards/symbolic_reward_partial_score/std": 0.18579120934009552, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0780143737792969, "sampling/importance_sampling_ratio/min": 0.00011447574070189148, "sampling/sampling_logp_difference/max": 9.07514762878418, "sampling/sampling_logp_difference/mean": 0.1497250497341156, "step": 969 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.30818693339824677, "epoch": 2.5526315789473686, "grad_norm": 0.0023296221625059843, "learning_rate": 1e-06, "loss": -0.0017, "step": 970 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3092123121023178, "epoch": 2.555263157894737, "grad_norm": 0.007309067994356155, "learning_rate": 1e-06, "loss": 0.0022, "step": 971 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3103838115930557, "epoch": 2.557894736842105, "grad_norm": 0.002875859383493662, "learning_rate": 1e-06, "loss": -0.0005, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 341.330078125, "completions/mean_terminated_length": 341.330078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.30619047582149506, "epoch": 2.5605263157894735, "frac_reward_zero_std": 0.6875, "grad_norm": 0.009937925264239311, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 220689162.0, "reward": 0.7670894861221313, "reward_std": 0.06431086361408234, "rewards/progression_diversity/mean": -3.514744821586646e-05, "rewards/progression_diversity/std": 0.0007952958694659173, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.9436848759651184, "rewards/symbolic_reward_partial_score/std": 0.12426012754440308, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.077713966369629, "sampling/importance_sampling_ratio/min": 0.002786138793453574, "sampling/sampling_logp_difference/max": 5.883098602294922, "sampling/sampling_logp_difference/mean": 0.14811402559280396, "step": 973 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.31514872610569, "epoch": 2.5631578947368423, "grad_norm": 0.002241847338154912, "learning_rate": 1e-06, "loss": -0.0015, "step": 974 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3097695857286453, "epoch": 2.5657894736842106, "grad_norm": 0.006119919009506702, "learning_rate": 1e-06, "loss": -0.0006, "step": 975 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.306579053401947, "epoch": 2.568421052631579, "grad_norm": 0.008558162488043308, "learning_rate": 1e-06, "loss": 0.0013, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 340.4921875, "completions/mean_terminated_length": 340.4921875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.3134208023548126, "epoch": 2.5710526315789473, "frac_reward_zero_std": 0.6875, "grad_norm": 0.007445094641298056, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 221250118.0, "reward": 0.6376464366912842, "reward_std": 0.05477411672472954, "rewards/progression_diversity/mean": -7.944680874061305e-06, "rewards/progression_diversity/std": 0.00017976762319449335, "rewards/symbolic_reward_accuracy/mean": 0.62109375, "rewards/symbolic_reward_accuracy/std": 0.4855891764163971, "rewards/symbolic_reward_partial_score/mean": 0.88330078125, "rewards/symbolic_reward_partial_score/std": 0.16613253951072693, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782685279846191, "sampling/importance_sampling_ratio/min": 0.0026720340829342604, "sampling/sampling_logp_difference/max": 5.924915313720703, "sampling/sampling_logp_difference/mean": 0.15010234713554382, "step": 977 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.31288468837738037, "epoch": 2.5736842105263156, "grad_norm": 0.003037536283954978, "learning_rate": 1e-06, "loss": -0.0005, "step": 978 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3096996992826462, "epoch": 2.5763157894736843, "grad_norm": 0.0053139738738536835, "learning_rate": 1e-06, "loss": -0.0005, "step": 979 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3134246915578842, "epoch": 2.5789473684210527, "grad_norm": 0.004795863758772612, "learning_rate": 1e-06, "loss": 0.0011, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 336.384765625, "completions/mean_terminated_length": 336.384765625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3105523884296417, "epoch": 2.581578947368421, "frac_reward_zero_std": 0.75, "grad_norm": 0.003965785726904869, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 221824331.0, "reward": 0.74609375, "reward_std": 0.05215379595756531, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.787109375, "rewards/symbolic_reward_accuracy/std": 0.409751296043396, "rewards/symbolic_reward_partial_score/mean": 0.9127604365348816, "rewards/symbolic_reward_partial_score/std": 0.17551816999912262, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778260231018066, "sampling/importance_sampling_ratio/min": 0.0006920514279045165, "sampling/sampling_logp_difference/max": 7.275850296020508, "sampling/sampling_logp_difference/mean": 0.1488214135169983, "step": 981 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.30993202328681946, "epoch": 2.5842105263157897, "grad_norm": 0.0053926375694572926, "learning_rate": 1e-06, "loss": -0.0008, "step": 982 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.30970150232315063, "epoch": 2.586842105263158, "grad_norm": 0.0028977948240935802, "learning_rate": 1e-06, "loss": -0.0006, "step": 983 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.30929918587207794, "epoch": 2.5894736842105264, "grad_norm": 0.005668803583830595, "learning_rate": 1e-06, "loss": 0.0006, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 344.669921875, "completions/mean_terminated_length": 344.669921875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.3082158714532852, "epoch": 2.5921052631578947, "frac_reward_zero_std": 0.78125, "grad_norm": 0.004866783507168293, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 222418626.0, "reward": 0.6892090439796448, "reward_std": 0.04069463908672333, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.705078125, "rewards/symbolic_reward_accuracy/std": 0.4564536213874817, "rewards/symbolic_reward_partial_score/mean": 0.88720703125, "rewards/symbolic_reward_partial_score/std": 0.20415787398815155, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0783774852752686, "sampling/importance_sampling_ratio/min": 0.004361480474472046, "sampling/sampling_logp_difference/max": 5.434943675994873, "sampling/sampling_logp_difference/mean": 0.1484195590019226, "step": 985 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.30943770706653595, "epoch": 2.594736842105263, "grad_norm": 0.00852410402148962, "learning_rate": 1e-06, "loss": -0.001, "step": 986 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.30831897258758545, "epoch": 2.5973684210526313, "grad_norm": 0.003436718601733446, "learning_rate": 1e-06, "loss": -0.0003, "step": 987 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.3049241006374359, "epoch": 2.6, "grad_norm": 0.007047669496387243, "learning_rate": 1e-06, "loss": -0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 341.205078125, "completions/mean_terminated_length": 341.205078125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.31349438428878784, "epoch": 2.6026315789473684, "frac_reward_zero_std": 0.71875, "grad_norm": 0.004920005798339844, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 222963083.0, "reward": 0.7357420921325684, "reward_std": 0.031880468130111694, "rewards/progression_diversity/mean": -1.5017576515674591e-05, "rewards/progression_diversity/std": 0.00033980896114371717, "rewards/symbolic_reward_accuracy/mean": 0.771484375, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.9095052480697632, "rewards/symbolic_reward_partial_score/std": 0.17970143258571625, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0788280963897705, "sampling/importance_sampling_ratio/min": 0.0024846631567925215, "sampling/sampling_logp_difference/max": 5.997618198394775, "sampling/sampling_logp_difference/mean": 0.15029466152191162, "step": 989 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3132941573858261, "epoch": 2.6052631578947367, "grad_norm": 0.005506443325430155, "learning_rate": 1e-06, "loss": 0.0004, "step": 990 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.31322436034679413, "epoch": 2.6078947368421055, "grad_norm": 0.0013167249271646142, "learning_rate": 1e-06, "loss": 0.0002, "step": 991 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.31497417390346527, "epoch": 2.610526315789474, "grad_norm": 0.003209513844922185, "learning_rate": 1e-06, "loss": -0.0014, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 339.025390625, "completions/mean_terminated_length": 339.025390625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.3111710846424103, "epoch": 2.613157894736842, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0031757066026329994, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 223535640.0, "reward": 0.7018066644668579, "reward_std": 0.05514729022979736, "rewards/progression_diversity/mean": -2.5508261387585662e-06, "rewards/progression_diversity/std": 5.771860742243007e-05, "rewards/symbolic_reward_accuracy/mean": 0.71875, "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, "rewards/symbolic_reward_partial_score/mean": 0.90185546875, "rewards/symbolic_reward_partial_score/std": 0.1845003068447113, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0782794952392578, "sampling/importance_sampling_ratio/min": 0.0020101333502680063, "sampling/sampling_logp_difference/max": 6.209554195404053, "sampling/sampling_logp_difference/mean": 0.1487007737159729, "step": 993 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3111952245235443, "epoch": 2.6157894736842104, "grad_norm": 0.00311102787964046, "learning_rate": 1e-06, "loss": -0.0022, "step": 994 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.3100218176841736, "epoch": 2.6184210526315788, "grad_norm": 0.008853343315422535, "learning_rate": 1e-06, "loss": 0.0002, "step": 995 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.30624426901340485, "epoch": 2.6210526315789475, "grad_norm": 0.00726295355707407, "learning_rate": 1e-06, "loss": 0.0009, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 338.2734375, "completions/mean_terminated_length": 338.2734375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.3080877661705017, "epoch": 2.623684210526316, "frac_reward_zero_std": 0.6875, "grad_norm": 0.005093069281429052, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 224128164.0, "reward": 0.6446288824081421, "reward_std": 0.07044163346290588, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.634765625, "rewards/symbolic_reward_accuracy/std": 0.4819667339324951, "rewards/symbolic_reward_partial_score/mean": 0.8792317509651184, "rewards/symbolic_reward_partial_score/std": 0.18442124128341675, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0780870914459229, "sampling/importance_sampling_ratio/min": 0.001496361568570137, "sampling/sampling_logp_difference/max": 6.504718780517578, "sampling/sampling_logp_difference/mean": 0.1492321938276291, "step": 997 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3098251223564148, "epoch": 2.626315789473684, "grad_norm": 0.003775182878598571, "learning_rate": 1e-06, "loss": -0.0012, "step": 998 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.30809755623340607, "epoch": 2.6289473684210525, "grad_norm": 0.005265187006443739, "learning_rate": 1e-06, "loss": 0.0007, "step": 999 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3101976066827774, "epoch": 2.6315789473684212, "grad_norm": 0.00613139383494854, "learning_rate": 1e-06, "loss": 0.0005, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 343.287109375, "completions/mean_terminated_length": 343.287109375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.31136079132556915, "epoch": 2.6342105263157896, "frac_reward_zero_std": 0.53125, "grad_norm": 0.0061224219389259815, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 224689047.0, "reward": 0.6758301258087158, "reward_std": 0.10899586975574493, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.677734375, "rewards/symbolic_reward_accuracy/std": 0.46780112385749817, "rewards/symbolic_reward_partial_score/mean": 0.8972982168197632, "rewards/symbolic_reward_partial_score/std": 0.16858573257923126, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.07841956615448, "sampling/importance_sampling_ratio/min": 0.0017329779220744967, "sampling/sampling_logp_difference/max": 6.357913970947266, "sampling/sampling_logp_difference/mean": 0.1503494828939438, "step": 1001 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.3088783025741577, "epoch": 2.636842105263158, "grad_norm": 0.0070188590325415134, "learning_rate": 1e-06, "loss": -0.0011, "step": 1002 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30825355648994446, "epoch": 2.639473684210526, "grad_norm": 0.003072237130254507, "learning_rate": 1e-06, "loss": -0.0007, "step": 1003 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.109375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.31032636761665344, "epoch": 2.6421052631578945, "grad_norm": 0.0050218356773257256, "learning_rate": 1e-06, "loss": 0.0011, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.30877159535884857, "epoch": 2.6447368421052633, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0041364701464772224, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 225278327.0, "reward": 0.7000486850738525, "reward_std": 0.07234279066324234, "rewards/progression_diversity/mean": -1.8634102161740884e-05, "rewards/progression_diversity/std": 0.0004216416273266077, "rewards/symbolic_reward_accuracy/mean": 0.708984375, "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, "rewards/symbolic_reward_partial_score/mean": 0.91552734375, "rewards/symbolic_reward_partial_score/std": 0.15014779567718506, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0777002573013306, "sampling/importance_sampling_ratio/min": 0.001144947949796915, "sampling/sampling_logp_difference/max": 6.772396087646484, "sampling/sampling_logp_difference/mean": 0.14913979172706604, "step": 1005 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3107386380434036, "epoch": 2.6473684210526316, "grad_norm": 0.005770625080913305, "learning_rate": 1e-06, "loss": -0.0007, "step": 1006 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3138565123081207, "epoch": 2.65, "grad_norm": 0.0046647596172988415, "learning_rate": 1e-06, "loss": -0.0003, "step": 1007 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3070129156112671, "epoch": 2.6526315789473687, "grad_norm": 0.01046669203788042, "learning_rate": 1e-06, "loss": 0.0017, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 341.783203125, "completions/mean_terminated_length": 341.783203125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.3066733479499817, "epoch": 2.655263157894737, "frac_reward_zero_std": 0.71875, "grad_norm": 0.006327769719064236, "learning_rate": 1e-06, "loss": -0.0013, "num_tokens": 225847624.0, "reward": 0.6906728148460388, "reward_std": 0.05062294378876686, "rewards/progression_diversity/mean": -0.0001056864857673645, "rewards/progression_diversity/std": 0.00239141215570271, "rewards/symbolic_reward_accuracy/mean": 0.70703125, "rewards/symbolic_reward_accuracy/std": 0.455569326877594, "rewards/symbolic_reward_partial_score/mean": 0.88818359375, "rewards/symbolic_reward_partial_score/std": 0.20053695142269135, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0784711837768555, "sampling/importance_sampling_ratio/min": 0.0028838764410465956, "sampling/sampling_logp_difference/max": 5.8486199378967285, "sampling/sampling_logp_difference/mean": 0.14774033427238464, "step": 1009 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.3070971816778183, "epoch": 2.6578947368421053, "grad_norm": 0.007127484772354364, "learning_rate": 1e-06, "loss": -0.0004, "step": 1010 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3089192360639572, "epoch": 2.6605263157894736, "grad_norm": 0.0016588810831308365, "learning_rate": 1e-06, "loss": 0.0008, "step": 1011 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3057337701320648, "epoch": 2.663157894736842, "grad_norm": 0.006851482670754194, "learning_rate": 1e-06, "loss": 0.0008, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14683.0, "completions/mean_length": 406.533203125, "completions/mean_terminated_length": 375.2661437988281, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.3020845800638199, "epoch": 2.6657894736842103, "frac_reward_zero_std": 0.65625, "grad_norm": 0.008201662451028824, "learning_rate": 1e-06, "loss": 0.0252, "num_tokens": 226443897.0, "reward": 0.721895158290863, "reward_std": 0.05035192146897316, "rewards/progression_diversity/mean": -0.00287051172927022, "rewards/progression_diversity/std": 0.04405975714325905, "rewards/symbolic_reward_accuracy/mean": 0.748046875, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.9109700322151184, "rewards/symbolic_reward_partial_score/std": 0.17348289489746094, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073678970336914, "sampling/importance_sampling_ratio/min": 0.0020475396886467934, "sampling/sampling_logp_difference/max": 6.1911163330078125, "sampling/sampling_logp_difference/mean": 0.14129182696342468, "step": 1013 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.30159105360507965, "epoch": 2.668421052631579, "grad_norm": 0.004705225117504597, "learning_rate": 1e-06, "loss": 0.0296, "step": 1014 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3036012649536133, "epoch": 2.6710526315789473, "grad_norm": 0.002584053436294198, "learning_rate": 1e-06, "loss": -0.0018, "step": 1015 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.3087976425886154, "epoch": 2.6736842105263157, "grad_norm": 0.0061477916315197945, "learning_rate": 1e-06, "loss": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 344.1484375, "completions/mean_terminated_length": 344.1484375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.3104751855134964, "epoch": 2.6763157894736844, "frac_reward_zero_std": 0.65625, "grad_norm": 0.00948393065482378, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 226990885.0, "reward": 0.7349108457565308, "reward_std": 0.06460164487361908, "rewards/progression_diversity/mean": -0.0001266201288672164, "rewards/progression_diversity/std": 0.0028650863096117973, "rewards/symbolic_reward_accuracy/mean": 0.76171875, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.92626953125, "rewards/symbolic_reward_partial_score/std": 0.14307039976119995, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0778229236602783, "sampling/importance_sampling_ratio/min": 0.000825743016321212, "sampling/sampling_logp_difference/max": 7.099226951599121, "sampling/sampling_logp_difference/mean": 0.15059536695480347, "step": 1017 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.309643492102623, "epoch": 2.6789473684210527, "grad_norm": 0.0018985444912686944, "learning_rate": 1e-06, "loss": 0.0002, "step": 1018 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3100026994943619, "epoch": 2.681578947368421, "grad_norm": 0.0010688561014831066, "learning_rate": 1e-06, "loss": -0.0002, "step": 1019 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.30169081687927246, "epoch": 2.6842105263157894, "grad_norm": 0.006834503263235092, "learning_rate": 1e-06, "loss": 0.0004, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 351.275390625, "completions/mean_terminated_length": 351.275390625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.30858996510505676, "epoch": 2.6868421052631577, "frac_reward_zero_std": 0.65625, "grad_norm": 0.007892133668065071, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 227591730.0, "reward": 0.7020506858825684, "reward_std": 0.07354225218296051, "rewards/progression_diversity/mean": -1.1801354048657231e-05, "rewards/progression_diversity/std": 0.0002670341345947236, "rewards/symbolic_reward_accuracy/mean": 0.7109375, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.9182942509651184, "rewards/symbolic_reward_partial_score/std": 0.14587374031543732, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0771903991699219, "sampling/importance_sampling_ratio/min": 0.0010955760953947902, "sampling/sampling_logp_difference/max": 6.816474914550781, "sampling/sampling_logp_difference/mean": 0.14943185448646545, "step": 1021 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.30647024512290955, "epoch": 2.6894736842105265, "grad_norm": 0.0044861165806651115, "learning_rate": 1e-06, "loss": 0.0001, "step": 1022 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.30685843527317047, "epoch": 2.692105263157895, "grad_norm": 0.0015153492568060756, "learning_rate": 1e-06, "loss": 0.0008, "step": 1023 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.31040677428245544, "epoch": 2.694736842105263, "grad_norm": 0.0017616256373003125, "learning_rate": 1e-06, "loss": -0.0014, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 345.9140625, "completions/mean_terminated_length": 345.9140625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.29469770193099976, "epoch": 2.6973684210526314, "frac_reward_zero_std": 0.5625, "grad_norm": 0.00477286521345377, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 228152454.0, "reward": 0.7686523795127869, "reward_std": 0.113402359187603, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.9176432490348816, "rewards/symbolic_reward_partial_score/std": 0.18666434288024902, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0759270191192627, "sampling/importance_sampling_ratio/min": 0.0006319702370092273, "sampling/sampling_logp_difference/max": 7.366668224334717, "sampling/sampling_logp_difference/mean": 0.14713361859321594, "step": 1025 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.30262984335422516, "epoch": 2.7, "grad_norm": 0.007038114592432976, "learning_rate": 1e-06, "loss": 0.003, "step": 1026 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.30532851815223694, "epoch": 2.7026315789473685, "grad_norm": 0.006812415551394224, "learning_rate": 1e-06, "loss": -0.0003, "step": 1027 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.30250802636146545, "epoch": 2.705263157894737, "grad_norm": 0.002279542852193117, "learning_rate": 1e-06, "loss": -0.0014, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 342.79296875, "completions/mean_terminated_length": 342.79296875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.30581048130989075, "epoch": 2.707894736842105, "frac_reward_zero_std": 0.53125, "grad_norm": 0.011115007102489471, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 228731932.0, "reward": 0.6307129263877869, "reward_std": 0.0981566533446312, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.630859375, "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, "rewards/symbolic_reward_partial_score/mean": 0.8406575918197632, "rewards/symbolic_reward_partial_score/std": 0.23888207972049713, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0764038562774658, "sampling/importance_sampling_ratio/min": 0.0026295094285160303, "sampling/sampling_logp_difference/max": 5.940958023071289, "sampling/sampling_logp_difference/mean": 0.14800997078418732, "step": 1029 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.1171875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.30258332192897797, "epoch": 2.7105263157894735, "grad_norm": 0.004817832726985216, "learning_rate": 1e-06, "loss": -0.0003, "step": 1030 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.140625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.30187736451625824, "epoch": 2.713157894736842, "grad_norm": 0.00844305194914341, "learning_rate": 1e-06, "loss": 0.0002, "step": 1031 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.1015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.3083288073539734, "epoch": 2.7157894736842105, "grad_norm": 0.004226917400956154, "learning_rate": 1e-06, "loss": -0.0015, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 348.5625, "completions/mean_terminated_length": 348.5625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "entropy": 0.30741892755031586, "epoch": 2.718421052631579, "frac_reward_zero_std": 0.59375, "grad_norm": 0.005195413250476122, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 229322044.0, "reward": 0.5599607229232788, "reward_std": 0.08515633642673492, "rewards/progression_diversity/mean": -2.2997846826910973e-05, "rewards/progression_diversity/std": 0.0005203818436712027, "rewards/symbolic_reward_accuracy/mean": 0.5078125, "rewards/symbolic_reward_accuracy/std": 0.5004279017448425, "rewards/symbolic_reward_partial_score/mean": 0.8509114384651184, "rewards/symbolic_reward_partial_score/std": 0.1921633780002594, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0783910751342773, "sampling/importance_sampling_ratio/min": 0.00017082311387639493, "sampling/sampling_logp_difference/max": 8.674881935119629, "sampling/sampling_logp_difference/mean": 0.1485639214515686, "step": 1033 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.30229929089546204, "epoch": 2.7210526315789476, "grad_norm": 0.005828053690493107, "learning_rate": 1e-06, "loss": 0.0002, "step": 1034 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3022471219301224, "epoch": 2.723684210526316, "grad_norm": 0.002931118942797184, "learning_rate": 1e-06, "loss": -0.0002, "step": 1035 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3109671622514725, "epoch": 2.7263157894736842, "grad_norm": 0.0031470481771975756, "learning_rate": 1e-06, "loss": 0.0006, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 342.962890625, "completions/mean_terminated_length": 342.962890625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "entropy": 0.3021458685398102, "epoch": 2.7289473684210526, "frac_reward_zero_std": 0.78125, "grad_norm": 0.00692404480651021, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 229923145.0, "reward": 0.698925793170929, "reward_std": 0.05597582459449768, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.716796875, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.8961588144302368, "rewards/symbolic_reward_partial_score/std": 0.17904885113239288, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.076554298400879, "sampling/importance_sampling_ratio/min": 0.0001375319843646139, "sampling/sampling_logp_difference/max": 8.891654014587402, "sampling/sampling_logp_difference/mean": 0.1469387412071228, "step": 1037 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.30062244832515717, "epoch": 2.731578947368421, "grad_norm": 0.008866215124726295, "learning_rate": 1e-06, "loss": 0.0004, "step": 1038 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.30110302567481995, "epoch": 2.734210526315789, "grad_norm": 0.002656978787854314, "learning_rate": 1e-06, "loss": -0.0012, "step": 1039 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.30177319049835205, "epoch": 2.736842105263158, "grad_norm": 0.0029455830808728933, "learning_rate": 1e-06, "loss": -0.0005, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 347.380859375, "completions/mean_terminated_length": 347.380859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.307174950838089, "epoch": 2.7394736842105263, "frac_reward_zero_std": 0.75, "grad_norm": 0.006679740268737078, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 230498796.0, "reward": 0.7644531726837158, "reward_std": 0.03681304678320885, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.80859375, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.931640625, "rewards/symbolic_reward_partial_score/std": 0.15179350972175598, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0770549774169922, "sampling/importance_sampling_ratio/min": 0.00020917513757012784, "sampling/sampling_logp_difference/max": 8.472338676452637, "sampling/sampling_logp_difference/mean": 0.14685845375061035, "step": 1041 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2970338463783264, "epoch": 2.7421052631578946, "grad_norm": 0.0014519651886075735, "learning_rate": 1e-06, "loss": -0.0013, "step": 1042 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.30320054292678833, "epoch": 2.7447368421052634, "grad_norm": 0.008143628016114235, "learning_rate": 1e-06, "loss": 0.0008, "step": 1043 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.30637988448143005, "epoch": 2.7473684210526317, "grad_norm": 0.0031961810309439898, "learning_rate": 1e-06, "loss": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 605.0, "completions/mean_length": 370.67578125, "completions/mean_terminated_length": 339.33856201171875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.2997948229312897, "epoch": 2.75, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0036635242868214846, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 231087718.0, "reward": 0.6814371943473816, "reward_std": 0.06737735867500305, "rewards/progression_diversity/mean": -0.000813078077044338, "rewards/progression_diversity/std": 0.018397856503725052, "rewards/symbolic_reward_accuracy/mean": 0.701171875, "rewards/symbolic_reward_accuracy/std": 0.45819199085235596, "rewards/symbolic_reward_partial_score/mean": 0.8697916865348816, "rewards/symbolic_reward_partial_score/std": 0.22192302346229553, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074350118637085, "sampling/importance_sampling_ratio/min": 9.22444343132156e-08, "sampling/sampling_logp_difference/max": 16.198823928833008, "sampling/sampling_logp_difference/mean": 0.14467072486877441, "step": 1045 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.3004506677389145, "epoch": 2.7526315789473683, "grad_norm": 0.004989446606487036, "learning_rate": 1e-06, "loss": 0.0273, "step": 1046 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.3001198023557663, "epoch": 2.7552631578947366, "grad_norm": 0.0067935604602098465, "learning_rate": 1e-06, "loss": 0.0022, "step": 1047 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2999381124973297, "epoch": 2.7578947368421054, "grad_norm": 0.0061677745543420315, "learning_rate": 1e-06, "loss": -0.0001, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 344.810546875, "completions/mean_terminated_length": 344.810546875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.3047361671924591, "epoch": 2.7605263157894737, "frac_reward_zero_std": 0.6875, "grad_norm": 0.005580128636211157, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 231680421.0, "reward": 0.6251953840255737, "reward_std": 0.09940549731254578, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.61328125, "rewards/symbolic_reward_accuracy/std": 0.48747459053993225, "rewards/symbolic_reward_partial_score/mean": 0.857421875, "rewards/symbolic_reward_partial_score/std": 0.20798955857753754, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0773075819015503, "sampling/importance_sampling_ratio/min": 0.0017322030616924167, "sampling/sampling_logp_difference/max": 6.35836124420166, "sampling/sampling_logp_difference/mean": 0.1478845775127411, "step": 1049 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.301644891500473, "epoch": 2.763157894736842, "grad_norm": 0.005946438293904066, "learning_rate": 1e-06, "loss": -0.0001, "step": 1050 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.30532458424568176, "epoch": 2.765789473684211, "grad_norm": 0.008235945366322994, "learning_rate": 1e-06, "loss": 0.001, "step": 1051 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.30299271643161774, "epoch": 2.768421052631579, "grad_norm": 0.0029075501952320337, "learning_rate": 1e-06, "loss": -0.0012, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 343.134765625, "completions/mean_terminated_length": 343.134765625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.3021633177995682, "epoch": 2.7710526315789474, "frac_reward_zero_std": 0.625, "grad_norm": 0.005222593899816275, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 232260074.0, "reward": 0.7296382188796997, "reward_std": 0.08588685095310211, "rewards/progression_diversity/mean": -5.076556408312172e-05, "rewards/progression_diversity/std": 0.0011486934963613749, "rewards/symbolic_reward_accuracy/mean": 0.759765625, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.91259765625, "rewards/symbolic_reward_partial_score/std": 0.17004802823066711, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0771366357803345, "sampling/importance_sampling_ratio/min": 6.993500778484929e-10, "sampling/sampling_logp_difference/max": 21.080869674682617, "sampling/sampling_logp_difference/mean": 0.14611807465553284, "step": 1053 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.3046809434890747, "epoch": 2.7736842105263158, "grad_norm": 0.005133381113409996, "learning_rate": 1e-06, "loss": -0.0002, "step": 1054 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.29959002137184143, "epoch": 2.776315789473684, "grad_norm": 0.009239893406629562, "learning_rate": 1e-06, "loss": -0.0001, "step": 1055 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.3024829030036926, "epoch": 2.7789473684210524, "grad_norm": 0.008001086302101612, "learning_rate": 1e-06, "loss": 0.0017, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 366.3203125, "completions/mean_terminated_length": 334.97454833984375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.29493679106235504, "epoch": 2.781578947368421, "frac_reward_zero_std": 0.65625, "grad_norm": 0.006747152656316757, "learning_rate": 1e-06, "loss": 0.027, "num_tokens": 232852942.0, "reward": 0.6987649202346802, "reward_std": 0.0806761085987091, "rewards/progression_diversity/mean": -0.0014349485281854868, "rewards/progression_diversity/std": 0.03182978555560112, "rewards/symbolic_reward_accuracy/mean": 0.720703125, "rewards/symbolic_reward_accuracy/std": 0.44909247756004333, "rewards/symbolic_reward_partial_score/mean": 0.8878580927848816, "rewards/symbolic_reward_partial_score/std": 0.20636910200119019, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0740900039672852, "sampling/importance_sampling_ratio/min": 0.0002785032265819609, "sampling/sampling_logp_difference/max": 8.186080932617188, "sampling/sampling_logp_difference/mean": 0.1437458097934723, "step": 1057 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3012588322162628, "epoch": 2.7842105263157895, "grad_norm": 0.006597322411835194, "learning_rate": 1e-06, "loss": 0.0004, "step": 1058 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.30101022124290466, "epoch": 2.786842105263158, "grad_norm": 0.008424767293035984, "learning_rate": 1e-06, "loss": 0.0003, "step": 1059 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.30317388474941254, "epoch": 2.7894736842105265, "grad_norm": 0.0033637969754636288, "learning_rate": 1e-06, "loss": 0.0004, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 333.837890625, "completions/mean_terminated_length": 333.837890625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.29957854747772217, "epoch": 2.792105263157895, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0058438642881810665, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 233455707.0, "reward": 0.6902344226837158, "reward_std": 0.06666649132966995, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.693359375, "rewards/symbolic_reward_accuracy/std": 0.4615498185157776, "rewards/symbolic_reward_partial_score/mean": 0.9140625, "rewards/symbolic_reward_partial_score/std": 0.1394774317741394, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0762736797332764, "sampling/importance_sampling_ratio/min": 0.0014469054294750094, "sampling/sampling_logp_difference/max": 6.538328170776367, "sampling/sampling_logp_difference/mean": 0.14617058634757996, "step": 1061 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.29650571942329407, "epoch": 2.794736842105263, "grad_norm": 0.003479498904198408, "learning_rate": 1e-06, "loss": 0.0004, "step": 1062 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.3057336062192917, "epoch": 2.7973684210526315, "grad_norm": 0.01066509634256363, "learning_rate": 1e-06, "loss": 0.0, "step": 1063 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3051034063100815, "epoch": 2.8, "grad_norm": 0.004155632574111223, "learning_rate": 1e-06, "loss": -0.0006, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 358.43359375, "completions/mean_terminated_length": 327.0724182128906, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.3072111904621124, "epoch": 2.8026315789473686, "frac_reward_zero_std": 0.71875, "grad_norm": 0.0020747000817209482, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 234053881.0, "reward": 0.6687940955162048, "reward_std": 0.04760359972715378, "rewards/progression_diversity/mean": -0.000473738502478227, "rewards/progression_diversity/std": 0.00999171007424593, "rewards/symbolic_reward_accuracy/mean": 0.6796875, "rewards/symbolic_reward_accuracy/std": 0.4670529365539551, "rewards/symbolic_reward_partial_score/mean": 0.87060546875, "rewards/symbolic_reward_partial_score/std": 0.21114623546600342, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0746687650680542, "sampling/importance_sampling_ratio/min": 0.006617438979446888, "sampling/sampling_logp_difference/max": 5.018046855926514, "sampling/sampling_logp_difference/mean": 0.14455817639827728, "step": 1065 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.3023747056722641, "epoch": 2.805263157894737, "grad_norm": 0.004723989870399237, "learning_rate": 1e-06, "loss": 0.0001, "step": 1066 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.3038836717605591, "epoch": 2.807894736842105, "grad_norm": 0.0013983779354020953, "learning_rate": 1e-06, "loss": -0.0005, "step": 1067 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2940693497657776, "epoch": 2.8105263157894735, "grad_norm": 0.0014218107098713517, "learning_rate": 1e-06, "loss": 0.0293, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 325.5859375, "completions/mean_terminated_length": 325.5859375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.30475354194641113, "epoch": 2.8131578947368423, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0021612788550555706, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 234609189.0, "reward": 0.7227051258087158, "reward_std": 0.0341927669942379, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.74609375, "rewards/symbolic_reward_accuracy/std": 0.43567025661468506, "rewards/symbolic_reward_partial_score/mean": 0.9168294668197632, "rewards/symbolic_reward_partial_score/std": 0.15962786972522736, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0750136375427246, "sampling/importance_sampling_ratio/min": 0.0031064762733876705, "sampling/sampling_logp_difference/max": 5.774266242980957, "sampling/sampling_logp_difference/mean": 0.14548389613628387, "step": 1069 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2990523725748062, "epoch": 2.8157894736842106, "grad_norm": 0.006843502167612314, "learning_rate": 1e-06, "loss": 0.0005, "step": 1070 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2972310781478882, "epoch": 2.818421052631579, "grad_norm": 0.003569354536011815, "learning_rate": 1e-06, "loss": -0.0007, "step": 1071 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.30257561802864075, "epoch": 2.8210526315789473, "grad_norm": 0.0011047772131860256, "learning_rate": 1e-06, "loss": 0.0014, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 342.23828125, "completions/mean_terminated_length": 342.23828125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.29948727786540985, "epoch": 2.8236842105263156, "frac_reward_zero_std": 0.59375, "grad_norm": 0.004726483020931482, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 235189567.0, "reward": 0.7009764909744263, "reward_std": 0.09700538963079453, "rewards/progression_diversity/mean": -1.0271490282320883e-05, "rewards/progression_diversity/std": 0.00023241728194989264, "rewards/symbolic_reward_accuracy/mean": 0.716796875, "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, "rewards/symbolic_reward_partial_score/mean": 0.9029948115348816, "rewards/symbolic_reward_partial_score/std": 0.17346768081188202, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0757352113723755, "sampling/importance_sampling_ratio/min": 0.00030262480140663683, "sampling/sampling_logp_difference/max": 8.10301685333252, "sampling/sampling_logp_difference/mean": 0.1474587321281433, "step": 1073 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.29904985427856445, "epoch": 2.8263157894736843, "grad_norm": 0.005854406394064426, "learning_rate": 1e-06, "loss": 0.0004, "step": 1074 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.3006422519683838, "epoch": 2.8289473684210527, "grad_norm": 0.002233149018138647, "learning_rate": 1e-06, "loss": -0.0016, "step": 1075 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.30377885699272156, "epoch": 2.831578947368421, "grad_norm": 0.002246702089905739, "learning_rate": 1e-06, "loss": 0.0018, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 330.958984375, "completions/mean_terminated_length": 330.958984375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.3022722750902176, "epoch": 2.8342105263157897, "frac_reward_zero_std": 0.71875, "grad_norm": 0.006821685470640659, "learning_rate": 1e-06, "loss": -0.0012, "num_tokens": 235734794.0, "reward": 0.7696281671524048, "reward_std": 0.053564704954624176, "rewards/progression_diversity/mean": -7.848920358810574e-05, "rewards/progression_diversity/std": 0.0012581268092617393, "rewards/symbolic_reward_accuracy/mean": 0.814453125, "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, "rewards/symbolic_reward_partial_score/mean": 0.9365234375, "rewards/symbolic_reward_partial_score/std": 0.15159018337726593, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0741266012191772, "sampling/importance_sampling_ratio/min": 0.002895012963563204, "sampling/sampling_logp_difference/max": 5.844765663146973, "sampling/sampling_logp_difference/mean": 0.14536406099796295, "step": 1077 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2940616309642792, "epoch": 2.836842105263158, "grad_norm": 0.0018197406316176057, "learning_rate": 1e-06, "loss": 0.0011, "step": 1078 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2982686161994934, "epoch": 2.8394736842105264, "grad_norm": 0.0015975015703588724, "learning_rate": 1e-06, "loss": -0.0006, "step": 1079 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2993506193161011, "epoch": 2.8421052631578947, "grad_norm": 0.0066468799486756325, "learning_rate": 1e-06, "loss": 0.0008, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 334.35546875, "completions/mean_terminated_length": 334.35546875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "entropy": 0.29797013103961945, "epoch": 2.844736842105263, "frac_reward_zero_std": 0.625, "grad_norm": 0.007863621227443218, "learning_rate": 1e-06, "loss": 0.0011, "num_tokens": 236330464.0, "reward": 0.7291990518569946, "reward_std": 0.08173226565122604, "rewards/progression_diversity/mean": -1.640707705519162e-05, "rewards/progression_diversity/std": 0.00037124980008229613, "rewards/symbolic_reward_accuracy/mean": 0.755859375, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.9189453125, "rewards/symbolic_reward_partial_score/std": 0.1701214611530304, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0763745307922363, "sampling/importance_sampling_ratio/min": 0.0045486390590667725, "sampling/sampling_logp_difference/max": 5.392927169799805, "sampling/sampling_logp_difference/mean": 0.14487287402153015, "step": 1081 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.2998821288347244, "epoch": 2.8473684210526313, "grad_norm": 0.012033012695610523, "learning_rate": 1e-06, "loss": 0.0007, "step": 1082 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.3000533878803253, "epoch": 2.85, "grad_norm": 0.004561115987598896, "learning_rate": 1e-06, "loss": -0.0005, "step": 1083 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.29871946573257446, "epoch": 2.8526315789473684, "grad_norm": 0.002735659945756197, "learning_rate": 1e-06, "loss": 0.0002, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 324.82421875, "completions/mean_terminated_length": 324.82421875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.29053474962711334, "epoch": 2.8552631578947367, "frac_reward_zero_std": 0.59375, "grad_norm": 0.004544893279671669, "learning_rate": 1e-06, "loss": -0.0014, "num_tokens": 236888230.0, "reward": 0.7167476415634155, "reward_std": 0.08542241156101227, "rewards/progression_diversity/mean": -4.313869430916384e-05, "rewards/progression_diversity/std": 0.0007985467091202736, "rewards/symbolic_reward_accuracy/mean": 0.732421875, "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, "rewards/symbolic_reward_partial_score/mean": 0.92431640625, "rewards/symbolic_reward_partial_score/std": 0.14619389176368713, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0729988813400269, "sampling/importance_sampling_ratio/min": 0.010550652630627155, "sampling/sampling_logp_difference/max": 4.551567554473877, "sampling/sampling_logp_difference/mean": 0.14252474904060364, "step": 1085 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.29088248312473297, "epoch": 2.8578947368421055, "grad_norm": 0.004123357590287924, "learning_rate": 1e-06, "loss": 0.0005, "step": 1086 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2578125, "entropy": 0.2879483252763748, "epoch": 2.860526315789474, "grad_norm": 0.007720407098531723, "learning_rate": 1e-06, "loss": 0.0012, "step": 1087 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.25, "entropy": 0.2954999506473541, "epoch": 2.863157894736842, "grad_norm": 0.0022629539016634226, "learning_rate": 1e-06, "loss": 0.0006, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 331.248046875, "completions/mean_terminated_length": 331.248046875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.29624035954475403, "epoch": 2.8657894736842104, "frac_reward_zero_std": 0.625, "grad_norm": 0.009815968573093414, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 237464645.0, "reward": 0.759228527545929, "reward_std": 0.08564972877502441, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.80078125, "rewards/symbolic_reward_accuracy/std": 0.39980348944664, "rewards/symbolic_reward_partial_score/mean": 0.92919921875, "rewards/symbolic_reward_partial_score/std": 0.162931889295578, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.074161410331726, "sampling/importance_sampling_ratio/min": 1.4037870641914196e-05, "sampling/sampling_logp_difference/max": 11.173751831054688, "sampling/sampling_logp_difference/mean": 0.14445924758911133, "step": 1089 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2950233966112137, "epoch": 2.8684210526315788, "grad_norm": 0.0048875752836465836, "learning_rate": 1e-06, "loss": -0.0021, "step": 1090 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.09375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.2931629717350006, "epoch": 2.8710526315789475, "grad_norm": 0.0026949893217533827, "learning_rate": 1e-06, "loss": 0.002, "step": 1091 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.29851092398166656, "epoch": 2.873684210526316, "grad_norm": 0.004008137155324221, "learning_rate": 1e-06, "loss": -0.0006, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 332.041015625, "completions/mean_terminated_length": 332.041015625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.2946535646915436, "epoch": 2.876315789473684, "frac_reward_zero_std": 0.65625, "grad_norm": 0.003926341887563467, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 238041626.0, "reward": 0.749999463558197, "reward_std": 0.073408342897892, "rewards/progression_diversity/mean": -5.6169395975302905e-05, "rewards/progression_diversity/std": 0.00099645322188735, "rewards/symbolic_reward_accuracy/mean": 0.779296875, "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, "rewards/symbolic_reward_partial_score/mean": 0.94140625, "rewards/symbolic_reward_partial_score/std": 0.1294398009777069, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0737342834472656, "sampling/importance_sampling_ratio/min": 0.002836739644408226, "sampling/sampling_logp_difference/max": 5.865099906921387, "sampling/sampling_logp_difference/mean": 0.14539852738380432, "step": 1093 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.29140058159828186, "epoch": 2.8789473684210525, "grad_norm": 0.003920792136341333, "learning_rate": 1e-06, "loss": -0.0014, "step": 1094 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2940357029438019, "epoch": 2.8815789473684212, "grad_norm": 0.008678693324327469, "learning_rate": 1e-06, "loss": -0.0004, "step": 1095 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.29278014600276947, "epoch": 2.8842105263157896, "grad_norm": 0.0035982029512524605, "learning_rate": 1e-06, "loss": 0.0012, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 326.4765625, "completions/mean_terminated_length": 326.4765625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2947344183921814, "epoch": 2.886842105263158, "frac_reward_zero_std": 0.8125, "grad_norm": 0.007296448573470116, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 238592558.0, "reward": 0.7330079078674316, "reward_std": 0.05293193459510803, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.75390625, "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, "rewards/symbolic_reward_partial_score/mean": 0.935546875, "rewards/symbolic_reward_partial_score/std": 0.12586505711078644, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0746783018112183, "sampling/importance_sampling_ratio/min": 0.0004666089080274105, "sampling/sampling_logp_difference/max": 7.670019149780273, "sampling/sampling_logp_difference/mean": 0.1425882875919342, "step": 1097 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2921011298894882, "epoch": 2.889473684210526, "grad_norm": 0.00487733306363225, "learning_rate": 1e-06, "loss": 0.0009, "step": 1098 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.29850101470947266, "epoch": 2.8921052631578945, "grad_norm": 0.003617974929511547, "learning_rate": 1e-06, "loss": -0.0012, "step": 1099 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2937781363725662, "epoch": 2.8947368421052633, "grad_norm": 0.002885440830141306, "learning_rate": 1e-06, "loss": -0.0001, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 336.349609375, "completions/mean_terminated_length": 336.349609375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.292828232049942, "epoch": 2.8973684210526316, "frac_reward_zero_std": 0.59375, "grad_norm": 0.010650069452822208, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 239156225.0, "reward": 0.7841760516166687, "reward_std": 0.08980467170476913, "rewards/progression_diversity/mean": -0.0003705902199726552, "rewards/progression_diversity/std": 0.008385499007999897, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.9498697519302368, "rewards/symbolic_reward_partial_score/std": 0.12645751237869263, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0728933811187744, "sampling/importance_sampling_ratio/min": 0.0003060997696593404, "sampling/sampling_logp_difference/max": 8.091599464416504, "sampling/sampling_logp_difference/mean": 0.1435360312461853, "step": 1101 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.2941160798072815, "epoch": 2.9, "grad_norm": 0.004061248153448105, "learning_rate": 1e-06, "loss": -0.0007, "step": 1102 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2895169258117676, "epoch": 2.9026315789473687, "grad_norm": 0.002520245499908924, "learning_rate": 1e-06, "loss": 0.0017, "step": 1103 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2936854809522629, "epoch": 2.905263157894737, "grad_norm": 0.0031915016006678343, "learning_rate": 1e-06, "loss": -0.001, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 322.76171875, "completions/mean_terminated_length": 322.76171875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2948418855667114, "epoch": 2.9078947368421053, "frac_reward_zero_std": 0.625, "grad_norm": 0.007212890312075615, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 239736295.0, "reward": 0.7425780296325684, "reward_std": 0.08646564930677414, "rewards/progression_diversity/mean": -1.5266872651409358e-05, "rewards/progression_diversity/std": 0.00034544989466667175, "rewards/symbolic_reward_accuracy/mean": 0.771484375, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.9322916269302368, "rewards/symbolic_reward_partial_score/std": 0.13940434157848358, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073547601699829, "sampling/importance_sampling_ratio/min": 0.0009037154377438128, "sampling/sampling_logp_difference/max": 7.00899600982666, "sampling/sampling_logp_difference/mean": 0.14504487812519073, "step": 1105 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2899184674024582, "epoch": 2.9105263157894736, "grad_norm": 0.0034633921459317207, "learning_rate": 1e-06, "loss": 0.0001, "step": 1106 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.29138121008872986, "epoch": 2.913157894736842, "grad_norm": 0.0033394438214600086, "learning_rate": 1e-06, "loss": 0.0011, "step": 1107 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.292303666472435, "epoch": 2.9157894736842103, "grad_norm": 0.004198621492832899, "learning_rate": 1e-06, "loss": -0.0012, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 327.3125, "completions/mean_terminated_length": 327.3125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.29678307473659515, "epoch": 2.918421052631579, "frac_reward_zero_std": 0.6875, "grad_norm": 0.00880721490830183, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 240292327.0, "reward": 0.7222640514373779, "reward_std": 0.05592235177755356, "rewards/progression_diversity/mean": -0.00015940384764689952, "rewards/progression_diversity/std": 0.00271405978128314, "rewards/symbolic_reward_accuracy/mean": 0.748046875, "rewards/symbolic_reward_accuracy/std": 0.43455907702445984, "rewards/symbolic_reward_partial_score/mean": 0.9114583134651184, "rewards/symbolic_reward_partial_score/std": 0.1814991533756256, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.073778510093689, "sampling/importance_sampling_ratio/min": 6.552357808686793e-05, "sampling/sampling_logp_difference/max": 9.633100509643555, "sampling/sampling_logp_difference/mean": 0.14584583044052124, "step": 1109 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.29646173119544983, "epoch": 2.9210526315789473, "grad_norm": 0.0014886661665514112, "learning_rate": 1e-06, "loss": 0.0004, "step": 1110 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.29257290065288544, "epoch": 2.9236842105263157, "grad_norm": 0.00437309592962265, "learning_rate": 1e-06, "loss": -0.0006, "step": 1111 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2957308143377304, "epoch": 2.9263157894736844, "grad_norm": 0.0028557151090353727, "learning_rate": 1e-06, "loss": 0.0012, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 327.962890625, "completions/mean_terminated_length": 327.962890625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2977665513753891, "epoch": 2.9289473684210527, "frac_reward_zero_std": 0.65625, "grad_norm": 0.0022349138744175434, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 240855892.0, "reward": 0.7024399042129517, "reward_std": 0.05233702063560486, "rewards/progression_diversity/mean": -0.00014923579874448478, "rewards/progression_diversity/std": 0.0023670888040214777, "rewards/symbolic_reward_accuracy/mean": 0.71875, "rewards/symbolic_reward_accuracy/std": 0.45004892349243164, "rewards/symbolic_reward_partial_score/mean": 0.9039713740348816, "rewards/symbolic_reward_partial_score/std": 0.1682942807674408, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0742888450622559, "sampling/importance_sampling_ratio/min": 0.0002084570296574384, "sampling/sampling_logp_difference/max": 8.475777626037598, "sampling/sampling_logp_difference/mean": 0.14589810371398926, "step": 1113 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.29716482758522034, "epoch": 2.931578947368421, "grad_norm": 0.003905137535184622, "learning_rate": 1e-06, "loss": -0.0012, "step": 1114 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.29154519736766815, "epoch": 2.9342105263157894, "grad_norm": 0.0021905223838984966, "learning_rate": 1e-06, "loss": 0.0002, "step": 1115 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.29408158361911774, "epoch": 2.9368421052631577, "grad_norm": 0.004286495503038168, "learning_rate": 1e-06, "loss": -0.0001, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 328.890625, "completions/mean_terminated_length": 328.890625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2955780029296875, "epoch": 2.9394736842105265, "frac_reward_zero_std": 0.6875, "grad_norm": 0.009008477441966534, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 241443612.0, "reward": 0.6625969409942627, "reward_std": 0.04375801980495453, "rewards/progression_diversity/mean": -7.874615403125063e-05, "rewards/progression_diversity/std": 0.0017818220658227801, "rewards/symbolic_reward_accuracy/mean": 0.66015625, "rewards/symbolic_reward_accuracy/std": 0.4741191864013672, "rewards/symbolic_reward_partial_score/mean": 0.8883463144302368, "rewards/symbolic_reward_partial_score/std": 0.17636752128601074, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0732474327087402, "sampling/importance_sampling_ratio/min": 0.0015634673181921244, "sampling/sampling_logp_difference/max": 6.460849285125732, "sampling/sampling_logp_difference/mean": 0.14395904541015625, "step": 1117 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.28961172699928284, "epoch": 2.942105263157895, "grad_norm": 0.0067161088809370995, "learning_rate": 1e-06, "loss": -0.0002, "step": 1118 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2924419641494751, "epoch": 2.944736842105263, "grad_norm": 0.004618462640792131, "learning_rate": 1e-06, "loss": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.29112420976161957, "epoch": 2.9473684210526314, "grad_norm": 0.0027965642511844635, "learning_rate": 1e-06, "loss": 0.0005, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 321.8125, "completions/mean_terminated_length": 321.8125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2882765978574753, "epoch": 2.95, "frac_reward_zero_std": 0.75, "grad_norm": 0.003613464767113328, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 242024220.0, "reward": 0.6606934070587158, "reward_std": 0.06243230402469635, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.65625, "rewards/symbolic_reward_accuracy/std": 0.4754233956336975, "rewards/symbolic_reward_partial_score/mean": 0.8898111581802368, "rewards/symbolic_reward_partial_score/std": 0.17068804800510406, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0739712715148926, "sampling/importance_sampling_ratio/min": 0.00012918752327095717, "sampling/sampling_logp_difference/max": 8.954245567321777, "sampling/sampling_logp_difference/mean": 0.14313524961471558, "step": 1121 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2939921021461487, "epoch": 2.9526315789473685, "grad_norm": 0.0043477327562868595, "learning_rate": 1e-06, "loss": -0.0, "step": 1122 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.28974662721157074, "epoch": 2.955263157894737, "grad_norm": 0.005428643431514502, "learning_rate": 1e-06, "loss": 0.0012, "step": 1123 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2926223576068878, "epoch": 2.957894736842105, "grad_norm": 0.0011934597278013825, "learning_rate": 1e-06, "loss": -0.0008, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 315.794921875, "completions/mean_terminated_length": 315.794921875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.28804731369018555, "epoch": 2.9605263157894735, "frac_reward_zero_std": 0.625, "grad_norm": 0.007428732700645924, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 242597395.0, "reward": 0.7770508527755737, "reward_std": 0.08177044987678528, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.9495442509651184, "rewards/symbolic_reward_partial_score/std": 0.1216139942407608, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0729912519454956, "sampling/importance_sampling_ratio/min": 0.000365082873031497, "sampling/sampling_logp_difference/max": 7.915386199951172, "sampling/sampling_logp_difference/mean": 0.1422921121120453, "step": 1125 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2904288172721863, "epoch": 2.963157894736842, "grad_norm": 0.006175660528242588, "learning_rate": 1e-06, "loss": -0.0006, "step": 1126 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.28833121061325073, "epoch": 2.9657894736842105, "grad_norm": 0.0024276673793792725, "learning_rate": 1e-06, "loss": -0.0004, "step": 1127 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.29089853167533875, "epoch": 2.968421052631579, "grad_norm": 0.002199800219386816, "learning_rate": 1e-06, "loss": -0.0005, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 382.857421875, "completions/mean_terminated_length": 320.10784912109375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.28097280859947205, "epoch": 2.9710526315789476, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0047841547057032585, "learning_rate": 1e-06, "loss": 0.0109, "num_tokens": 243197226.0, "reward": 0.6922287940979004, "reward_std": 0.10817395895719528, "rewards/progression_diversity/mean": -0.0007603330886922777, "rewards/progression_diversity/std": 0.015662960708141327, "rewards/symbolic_reward_accuracy/mean": 0.708984375, "rewards/symbolic_reward_accuracy/std": 0.45467492938041687, "rewards/symbolic_reward_partial_score/mean": 0.8907877802848816, "rewards/symbolic_reward_partial_score/std": 0.19231903553009033, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071429967880249, "sampling/importance_sampling_ratio/min": 0.001035291119478643, "sampling/sampling_logp_difference/max": 6.873072624206543, "sampling/sampling_logp_difference/mean": 0.1391676664352417, "step": 1129 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.29636819660663605, "epoch": 2.973684210526316, "grad_norm": 0.005953166633844376, "learning_rate": 1e-06, "loss": 0.0001, "step": 1130 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.2874244600534439, "epoch": 2.9763157894736842, "grad_norm": 0.0069323936477303505, "learning_rate": 1e-06, "loss": -0.0007, "step": 1131 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.292653352022171, "epoch": 2.9789473684210526, "grad_norm": 0.008772444911301136, "learning_rate": 1e-06, "loss": 0.0096, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 381.43359375, "completions/mean_terminated_length": 318.6784362792969, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.2889240086078644, "epoch": 2.981578947368421, "frac_reward_zero_std": 0.46875, "grad_norm": 0.01059815101325512, "learning_rate": 1e-06, "loss": 0.0317, "num_tokens": 243830536.0, "reward": 0.6876782178878784, "reward_std": 0.1107964962720871, "rewards/progression_diversity/mean": -0.0017093454953283072, "rewards/progression_diversity/std": 0.03062625229358673, "rewards/symbolic_reward_accuracy/mean": 0.6953125, "rewards/symbolic_reward_accuracy/std": 0.4607250988483429, "rewards/symbolic_reward_partial_score/mean": 0.9029947519302368, "rewards/symbolic_reward_partial_score/std": 0.1694251000881195, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0704015493392944, "sampling/importance_sampling_ratio/min": 0.00027761273668147624, "sampling/sampling_logp_difference/max": 8.18928337097168, "sampling/sampling_logp_difference/mean": 0.1361396610736847, "step": 1133 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.28125, "entropy": 0.2885236442089081, "epoch": 2.984210526315789, "grad_norm": 0.0028568825218826532, "learning_rate": 1e-06, "loss": 0.0006, "step": 1134 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.28880971670150757, "epoch": 2.986842105263158, "grad_norm": 0.0019433270208537579, "learning_rate": 1e-06, "loss": -0.0019, "step": 1135 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.203125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.265625, "entropy": 0.28411930799484253, "epoch": 2.9894736842105263, "grad_norm": 0.006811541970819235, "learning_rate": 1e-06, "loss": 0.0082, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 317.9765625, "completions/mean_terminated_length": 317.9765625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.29524092376232147, "epoch": 2.9921052631578946, "frac_reward_zero_std": 0.75, "grad_norm": 0.0021421704441308975, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 244400316.0, "reward": 0.8205077648162842, "reward_std": 0.03574543446302414, "rewards/progression_diversity/mean": -6.387043413269566e-06, "rewards/progression_diversity/std": 0.0001445223024347797, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9615885019302368, "rewards/symbolic_reward_partial_score/std": 0.13740219175815582, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0735294818878174, "sampling/importance_sampling_ratio/min": 0.0010848650708794594, "sampling/sampling_logp_difference/max": 6.826299667358398, "sampling/sampling_logp_difference/mean": 0.14254550635814667, "step": 1137 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.29247477650642395, "epoch": 2.9947368421052634, "grad_norm": 0.0035705517511814833, "learning_rate": 1e-06, "loss": 0.0006, "step": 1138 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.29122480750083923, "epoch": 2.9973684210526317, "grad_norm": 0.003994923550635576, "learning_rate": 1e-06, "loss": 0.0003, "step": 1139 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2921169400215149, "epoch": 3.0, "grad_norm": 0.0015517091378569603, "learning_rate": 1e-06, "loss": 0.0004, "step": 1140 }, { "epoch": 3.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.000244140625, "eval_completions/max_length": 1206.28125, "eval_completions/max_terminated_length": 711.5625, "eval_completions/mean_length": 320.849609375, "eval_completions/mean_terminated_length": 316.92619037628174, "eval_completions/min_length": 157.03125, "eval_completions/min_terminated_length": 157.03125, "eval_entropy": 0.28882430028170347, "eval_frac_reward_zero_std": 0.77734375, "eval_loss": 0.0005815211334265769, "eval_num_tokens": 244400316.0, "eval_reward": 0.7942132484167814, "eval_reward_std": 0.044389907372533344, "eval_rewards/progression_diversity/mean": -6.53385166060616e-05, "eval_rewards/progression_diversity/std": 0.0007133460931072477, "eval_rewards/symbolic_reward_accuracy/mean": 0.85400390625, "eval_rewards/symbolic_reward_accuracy/std": 0.3091083026956767, "eval_rewards/symbolic_reward_partial_score/mean": 0.942626953125, "eval_rewards/symbolic_reward_partial_score/std": 0.13389483519131318, "eval_rewards/tag_count_reward/mean": -0.009765625, "eval_rewards/tag_count_reward/std": 0.04943125694990158, "eval_runtime": 168.3957, "eval_samples_per_second": 1.485, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.071236602962017, "eval_sampling/importance_sampling_ratio/min": 0.003390098211092703, "eval_sampling/sampling_logp_difference/max": 20.573874562978745, "eval_sampling/sampling_logp_difference/mean": 0.14651945093646646, "eval_steps_per_second": 0.012, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 315.07421875, "completions/mean_terminated_length": 315.07421875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.28790298104286194, "epoch": 3.0026315789473683, "frac_reward_zero_std": 0.5625, "grad_norm": 0.010073988698422909, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 244965442.0, "reward": 0.741455078125, "reward_std": 0.10208778083324432, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.771484375, "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, "rewards/symbolic_reward_partial_score/mean": 0.9285481572151184, "rewards/symbolic_reward_partial_score/std": 0.14986184239387512, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0723671913146973, "sampling/importance_sampling_ratio/min": 0.0002522034337744117, "sampling/sampling_logp_difference/max": 8.285274505615234, "sampling/sampling_logp_difference/mean": 0.14196112751960754, "step": 1141 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1953125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.2821872681379318, "epoch": 3.0052631578947366, "grad_norm": 0.010949229821562767, "learning_rate": 1e-06, "loss": -0.0002, "step": 1142 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.29011546075344086, "epoch": 3.0078947368421054, "grad_norm": 0.01058491412550211, "learning_rate": 1e-06, "loss": -0.0005, "step": 1143 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.28657861053943634, "epoch": 3.0105263157894737, "grad_norm": 0.006004285998642445, "learning_rate": 1e-06, "loss": 0.0012, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 322.423828125, "completions/mean_terminated_length": 322.423828125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.28619617223739624, "epoch": 3.013157894736842, "frac_reward_zero_std": 0.75, "grad_norm": 0.005281769670546055, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 245537499.0, "reward": 0.7984375357627869, "reward_std": 0.04122573137283325, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.9505208730697632, "rewards/symbolic_reward_partial_score/std": 0.12660686671733856, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0715934038162231, "sampling/importance_sampling_ratio/min": 0.0029423898085951805, "sampling/sampling_logp_difference/max": 5.828533172607422, "sampling/sampling_logp_difference/mean": 0.14253807067871094, "step": 1145 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2869596779346466, "epoch": 3.0157894736842104, "grad_norm": 0.0015528675867244601, "learning_rate": 1e-06, "loss": -0.001, "step": 1146 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2859642207622528, "epoch": 3.018421052631579, "grad_norm": 0.009877209551632404, "learning_rate": 1e-06, "loss": 0.0009, "step": 1147 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2844521105289459, "epoch": 3.0210526315789474, "grad_norm": 0.00118832488078624, "learning_rate": 1e-06, "loss": 0.0008, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 319.44921875, "completions/mean_terminated_length": 319.44921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.2846902459859848, "epoch": 3.0236842105263158, "frac_reward_zero_std": 0.71875, "grad_norm": 0.003452679142355919, "learning_rate": 1e-06, "loss": -0.0009, "num_tokens": 246112705.0, "reward": 0.7942871451377869, "reward_std": 0.06831326335668564, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.9366861581802368, "rewards/symbolic_reward_partial_score/std": 0.1584509164094925, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0727237462997437, "sampling/importance_sampling_ratio/min": 0.00042281081550754607, "sampling/sampling_logp_difference/max": 7.768585681915283, "sampling/sampling_logp_difference/mean": 0.142282634973526, "step": 1149 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.28898128867149353, "epoch": 3.026315789473684, "grad_norm": 0.002851970260962844, "learning_rate": 1e-06, "loss": 0.0003, "step": 1150 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.28988291323184967, "epoch": 3.028947368421053, "grad_norm": 0.0016596262576058507, "learning_rate": 1e-06, "loss": 0.0013, "step": 1151 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.28207847476005554, "epoch": 3.031578947368421, "grad_norm": 0.0025477732997387648, "learning_rate": 1e-06, "loss": 0.0003, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 320.615234375, "completions/mean_terminated_length": 320.615234375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "entropy": 0.2825598418712616, "epoch": 3.0342105263157895, "frac_reward_zero_std": 0.65625, "grad_norm": 0.003103437600657344, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 246672988.0, "reward": 0.810498058795929, "reward_std": 0.0731193870306015, "rewards/progression_diversity/mean": -1.2725857914119842e-06, "rewards/progression_diversity/std": 2.8795329853892326e-05, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.93994140625, "rewards/symbolic_reward_partial_score/std": 0.17745040357112885, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071824550628662, "sampling/importance_sampling_ratio/min": 0.0028972846921533346, "sampling/sampling_logp_difference/max": 5.8439812660217285, "sampling/sampling_logp_difference/mean": 0.14137542247772217, "step": 1153 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.28350427746772766, "epoch": 3.036842105263158, "grad_norm": 0.0032226620241999626, "learning_rate": 1e-06, "loss": 0.0, "step": 1154 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2870677411556244, "epoch": 3.039473684210526, "grad_norm": 0.00388253852725029, "learning_rate": 1e-06, "loss": 0.0004, "step": 1155 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2872246205806732, "epoch": 3.042105263157895, "grad_norm": 0.0034006177447736263, "learning_rate": 1e-06, "loss": 0.001, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 844.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 314.228515625, "completions/mean_terminated_length": 314.228515625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.28370508551597595, "epoch": 3.044736842105263, "frac_reward_zero_std": 0.6875, "grad_norm": 0.00431436114013195, "learning_rate": 1e-06, "loss": -0.0015, "num_tokens": 247245201.0, "reward": 0.7706543207168579, "reward_std": 0.07343325018882751, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.92822265625, "rewards/symbolic_reward_partial_score/std": 0.16191674768924713, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0711597204208374, "sampling/importance_sampling_ratio/min": 7.088058919180185e-05, "sampling/sampling_logp_difference/max": 9.554513931274414, "sampling/sampling_logp_difference/mean": 0.1412719488143921, "step": 1157 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2819001376628876, "epoch": 3.0473684210526315, "grad_norm": 0.005949938669800758, "learning_rate": 1e-06, "loss": 0.0016, "step": 1158 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.2845749258995056, "epoch": 3.05, "grad_norm": 0.009011217392981052, "learning_rate": 1e-06, "loss": 0.0034, "step": 1159 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.28015366196632385, "epoch": 3.0526315789473686, "grad_norm": 0.0038384415674954653, "learning_rate": 1e-06, "loss": -0.0022, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 315.591796875, "completions/mean_terminated_length": 315.591796875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.27949464321136475, "epoch": 3.055263157894737, "frac_reward_zero_std": 0.6875, "grad_norm": 0.004537342581897974, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 247801568.0, "reward": 0.7544918060302734, "reward_std": 0.05182567238807678, "rewards/progression_diversity/mean": -4.503194213612005e-05, "rewards/progression_diversity/std": 0.0010189565364271402, "rewards/symbolic_reward_accuracy/mean": 0.796875, "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, "rewards/symbolic_reward_partial_score/mean": 0.9212239980697632, "rewards/symbolic_reward_partial_score/std": 0.1719035506248474, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0718927383422852, "sampling/importance_sampling_ratio/min": 1.6995967598631978e-05, "sampling/sampling_logp_difference/max": 10.982534408569336, "sampling/sampling_logp_difference/mean": 0.14235100150108337, "step": 1161 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2826980799436569, "epoch": 3.057894736842105, "grad_norm": 0.00524579593911767, "learning_rate": 1e-06, "loss": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2870752066373825, "epoch": 3.0605263157894735, "grad_norm": 0.005643745418637991, "learning_rate": 1e-06, "loss": 0.0013, "step": 1163 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2850530445575714, "epoch": 3.0631578947368423, "grad_norm": 0.007966126315295696, "learning_rate": 1e-06, "loss": -0.0002, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 321.208984375, "completions/mean_terminated_length": 321.208984375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2806635946035385, "epoch": 3.0657894736842106, "frac_reward_zero_std": 0.78125, "grad_norm": 0.004814115818589926, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 248375691.0, "reward": 0.7567381858825684, "reward_std": 0.047071587294340134, "rewards/progression_diversity/mean": -1.0009222933149431e-05, "rewards/progression_diversity/std": 0.00022648285084869713, "rewards/symbolic_reward_accuracy/mean": 0.80078125, "rewards/symbolic_reward_accuracy/std": 0.39980348944664, "rewards/symbolic_reward_partial_score/mean": 0.9208984375, "rewards/symbolic_reward_partial_score/std": 0.16742649674415588, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071966528892517, "sampling/importance_sampling_ratio/min": 0.00016959384083747864, "sampling/sampling_logp_difference/max": 8.682104110717773, "sampling/sampling_logp_difference/mean": 0.1426159143447876, "step": 1165 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.28873564302921295, "epoch": 3.068421052631579, "grad_norm": 0.0025189740117639303, "learning_rate": 1e-06, "loss": -0.0006, "step": 1166 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2868616282939911, "epoch": 3.0710526315789473, "grad_norm": 0.0013349374057725072, "learning_rate": 1e-06, "loss": -0.0009, "step": 1167 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.28100864589214325, "epoch": 3.0736842105263156, "grad_norm": 0.002027245005592704, "learning_rate": 1e-06, "loss": 0.0012, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 315.712890625, "completions/mean_terminated_length": 315.712890625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.285733699798584, "epoch": 3.0763157894736843, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0012603303184732795, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 248913112.0, "reward": 0.800585150718689, "reward_std": 0.02160678058862686, "rewards/progression_diversity/mean": -8.479159441776574e-05, "rewards/progression_diversity/std": 0.0017334631411358714, "rewards/symbolic_reward_accuracy/mean": 0.865234375, "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, "rewards/symbolic_reward_partial_score/mean": 0.9381510615348816, "rewards/symbolic_reward_partial_score/std": 0.16102522611618042, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071446418762207, "sampling/importance_sampling_ratio/min": 1.3583598246214024e-08, "sampling/sampling_logp_difference/max": 18.114402770996094, "sampling/sampling_logp_difference/mean": 0.14119446277618408, "step": 1169 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.28660933673381805, "epoch": 3.0789473684210527, "grad_norm": 0.0005353147862479091, "learning_rate": 1e-06, "loss": -0.0008, "step": 1170 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2914620041847229, "epoch": 3.081578947368421, "grad_norm": 0.0039962842129170895, "learning_rate": 1e-06, "loss": 0.0002, "step": 1171 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2793559432029724, "epoch": 3.0842105263157893, "grad_norm": 0.004289968870580196, "learning_rate": 1e-06, "loss": 0.0003, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 313.99609375, "completions/mean_terminated_length": 313.99609375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "entropy": 0.2798050791025162, "epoch": 3.086842105263158, "frac_reward_zero_std": 0.75, "grad_norm": 0.004182688891887665, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 249467158.0, "reward": 0.8178223371505737, "reward_std": 0.051801957190036774, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.95263671875, "rewards/symbolic_reward_partial_score/std": 0.13850493729114532, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0719172954559326, "sampling/importance_sampling_ratio/min": 4.298911881051026e-05, "sampling/sampling_logp_difference/max": 10.054563522338867, "sampling/sampling_logp_difference/mean": 0.1404031664133072, "step": 1173 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.28199082612991333, "epoch": 3.0894736842105264, "grad_norm": 0.0009707204881124198, "learning_rate": 1e-06, "loss": 0.0009, "step": 1174 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.28277716040611267, "epoch": 3.0921052631578947, "grad_norm": 0.006554114166647196, "learning_rate": 1e-06, "loss": -0.0004, "step": 1175 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.27743278443813324, "epoch": 3.094736842105263, "grad_norm": 0.001463320222683251, "learning_rate": 1e-06, "loss": 0.0001, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 317.107421875, "completions/mean_terminated_length": 317.107421875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.27970318496227264, "epoch": 3.0973684210526318, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0029366682283580303, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 250034989.0, "reward": 0.7665039300918579, "reward_std": 0.03309086337685585, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.80859375, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.9378255009651184, "rewards/symbolic_reward_partial_score/std": 0.14253705739974976, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.071006178855896, "sampling/importance_sampling_ratio/min": 0.00015880883438512683, "sampling/sampling_logp_difference/max": 8.747809410095215, "sampling/sampling_logp_difference/mean": 0.14218075573444366, "step": 1177 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.28671663999557495, "epoch": 3.1, "grad_norm": 0.007308136206120253, "learning_rate": 1e-06, "loss": 0.0006, "step": 1178 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.28224754333496094, "epoch": 3.1026315789473684, "grad_norm": 0.0011736652813851833, "learning_rate": 1e-06, "loss": -0.0004, "step": 1179 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.28825029730796814, "epoch": 3.1052631578947367, "grad_norm": 0.0011161916190758348, "learning_rate": 1e-06, "loss": -0.0003, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 316.990234375, "completions/mean_terminated_length": 316.990234375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2790067791938782, "epoch": 3.1078947368421055, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0015674019232392311, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 250602600.0, "reward": 0.7606445550918579, "reward_std": 0.05273447930812836, "rewards/progression_diversity/mean": -2.454866489642882e-06, "rewards/progression_diversity/std": 5.5547287047374994e-05, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.9222005605697632, "rewards/symbolic_reward_partial_score/std": 0.17716993391513824, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0702251195907593, "sampling/importance_sampling_ratio/min": 0.0002612802491057664, "sampling/sampling_logp_difference/max": 8.249917030334473, "sampling/sampling_logp_difference/mean": 0.13987314701080322, "step": 1181 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.28073863685131073, "epoch": 3.110526315789474, "grad_norm": 0.001168043352663517, "learning_rate": 1e-06, "loss": -0.0, "step": 1182 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.27858008444309235, "epoch": 3.113157894736842, "grad_norm": 0.0010491248685866594, "learning_rate": 1e-06, "loss": 0.0013, "step": 1183 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.28045469522476196, "epoch": 3.1157894736842104, "grad_norm": 0.007387771271169186, "learning_rate": 1e-06, "loss": -0.0003, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 343.73828125, "completions/mean_terminated_length": 312.34832763671875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.2847536504268646, "epoch": 3.1184210526315788, "frac_reward_zero_std": 0.8125, "grad_norm": 0.001716139609925449, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 251185730.0, "reward": 0.7362680435180664, "reward_std": 0.028277598321437836, "rewards/progression_diversity/mean": -0.0011275196447968483, "rewards/progression_diversity/std": 0.02525327354669571, "rewards/symbolic_reward_accuracy/mean": 0.767578125, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.9191080927848816, "rewards/symbolic_reward_partial_score/std": 0.16660796105861664, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0696632862091064, "sampling/importance_sampling_ratio/min": 4.833166167372838e-05, "sampling/sampling_logp_difference/max": 9.937423706054688, "sampling/sampling_logp_difference/mean": 0.13752613961696625, "step": 1185 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.27466967701911926, "epoch": 3.1210526315789475, "grad_norm": 0.003341392381116748, "learning_rate": 1e-06, "loss": 0.0292, "step": 1186 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2820413112640381, "epoch": 3.123684210526316, "grad_norm": 0.0033147488720715046, "learning_rate": 1e-06, "loss": -0.0008, "step": 1187 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.278613805770874, "epoch": 3.126315789473684, "grad_norm": 0.0009309493470937014, "learning_rate": 1e-06, "loss": -0.0012, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 350.95703125, "completions/mean_terminated_length": 319.5812072753906, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.2692241817712784, "epoch": 3.1289473684210525, "frac_reward_zero_std": 0.8125, "grad_norm": 0.007036528550088406, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 251774220.0, "reward": 0.7931602001190186, "reward_std": 0.043564073741436005, "rewards/progression_diversity/mean": -0.00038891550502739847, "rewards/progression_diversity/std": 0.008800153620541096, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.9368489980697632, "rewards/symbolic_reward_partial_score/std": 0.165929913520813, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0689858198165894, "sampling/importance_sampling_ratio/min": 0.0024728423450142145, "sampling/sampling_logp_difference/max": 6.002387046813965, "sampling/sampling_logp_difference/mean": 0.13437007367610931, "step": 1189 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.28147004544734955, "epoch": 3.1315789473684212, "grad_norm": 0.004228492267429829, "learning_rate": 1e-06, "loss": -0.0009, "step": 1190 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.27740536630153656, "epoch": 3.1342105263157896, "grad_norm": 0.002545455237850547, "learning_rate": 1e-06, "loss": 0.0001, "step": 1191 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.275810107588768, "epoch": 3.136842105263158, "grad_norm": 0.0027159368619322777, "learning_rate": 1e-06, "loss": -0.0001, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 311.162109375, "completions/mean_terminated_length": 311.162109375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.277396023273468, "epoch": 3.139473684210526, "frac_reward_zero_std": 0.8125, "grad_norm": 0.002241811016574502, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 252345503.0, "reward": 0.7347656488418579, "reward_std": 0.04471675306558609, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.767578125, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.9140625, "rewards/symbolic_reward_partial_score/std": 0.17476648092269897, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069706916809082, "sampling/importance_sampling_ratio/min": 0.0019460662733763456, "sampling/sampling_logp_difference/max": 6.241945266723633, "sampling/sampling_logp_difference/mean": 0.13861989974975586, "step": 1193 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.27622032165527344, "epoch": 3.1421052631578945, "grad_norm": 0.008059216663241386, "learning_rate": 1e-06, "loss": 0.0002, "step": 1194 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2781420797109604, "epoch": 3.1447368421052633, "grad_norm": 0.003802333725616336, "learning_rate": 1e-06, "loss": 0.0005, "step": 1195 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.27944231033325195, "epoch": 3.1473684210526316, "grad_norm": 0.0019333260133862495, "learning_rate": 1e-06, "loss": -0.0009, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 315.603515625, "completions/mean_terminated_length": 315.603515625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.2817239761352539, "epoch": 3.15, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0066985213197767735, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 252910900.0, "reward": 0.7115720510482788, "reward_std": 0.038549572229385376, "rewards/progression_diversity/mean": -1.895956665975973e-05, "rewards/progression_diversity/std": 0.0004290060023777187, "rewards/symbolic_reward_accuracy/mean": 0.73828125, "rewards/symbolic_reward_accuracy/std": 0.44000017642974854, "rewards/symbolic_reward_partial_score/mean": 0.8953450918197632, "rewards/symbolic_reward_partial_score/std": 0.18525949120521545, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0701593160629272, "sampling/importance_sampling_ratio/min": 0.0006146501400507987, "sampling/sampling_logp_difference/max": 7.3944573402404785, "sampling/sampling_logp_difference/mean": 0.14031648635864258, "step": 1197 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.28350114822387695, "epoch": 3.1526315789473682, "grad_norm": 0.0015183095820248127, "learning_rate": 1e-06, "loss": -0.0006, "step": 1198 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.27260275185108185, "epoch": 3.155263157894737, "grad_norm": 0.003470479976385832, "learning_rate": 1e-06, "loss": 0.0014, "step": 1199 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2772819846868515, "epoch": 3.1578947368421053, "grad_norm": 0.007544191554188728, "learning_rate": 1e-06, "loss": -0.0007, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 311.41796875, "completions/mean_terminated_length": 311.41796875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.2722645252943039, "epoch": 3.1605263157894736, "frac_reward_zero_std": 0.875, "grad_norm": 0.0009831043425947428, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 253476138.0, "reward": 0.696484386920929, "reward_std": 0.02265625074505806, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.71484375, "rewards/symbolic_reward_accuracy/std": 0.45193037390708923, "rewards/symbolic_reward_partial_score/mean": 0.8919271230697632, "rewards/symbolic_reward_partial_score/std": 0.19320917129516602, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069796085357666, "sampling/importance_sampling_ratio/min": 0.0006912931567057967, "sampling/sampling_logp_difference/max": 7.276946544647217, "sampling/sampling_logp_difference/mean": 0.1394611895084381, "step": 1201 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.280326783657074, "epoch": 3.163157894736842, "grad_norm": 0.0007917368202470243, "learning_rate": 1e-06, "loss": -0.0003, "step": 1202 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.27794717252254486, "epoch": 3.1657894736842107, "grad_norm": 0.0003407469193916768, "learning_rate": 1e-06, "loss": 0.0005, "step": 1203 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.27963490784168243, "epoch": 3.168421052631579, "grad_norm": 0.0004433590220287442, "learning_rate": 1e-06, "loss": -0.0007, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 346.998046875, "completions/mean_terminated_length": 315.6144714355469, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2839759588241577, "epoch": 3.1710526315789473, "frac_reward_zero_std": 0.78125, "grad_norm": 0.003669463098049164, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 254036937.0, "reward": 0.8009728789329529, "reward_std": 0.04420147091150284, "rewards/progression_diversity/mean": -0.00037003474426455796, "rewards/progression_diversity/std": 0.007489512208849192, "rewards/symbolic_reward_accuracy/mean": 0.869140625, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.9322916865348816, "rewards/symbolic_reward_partial_score/std": 0.17563912272453308, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0679644346237183, "sampling/importance_sampling_ratio/min": 6.135964940767735e-05, "sampling/sampling_logp_difference/max": 9.698758125305176, "sampling/sampling_logp_difference/mean": 0.1385321319103241, "step": 1205 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.28549641370773315, "epoch": 3.1736842105263157, "grad_norm": 0.006869847420603037, "learning_rate": 1e-06, "loss": -0.0004, "step": 1206 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2799648195505142, "epoch": 3.1763157894736844, "grad_norm": 0.002433902584016323, "learning_rate": 1e-06, "loss": 0.0004, "step": 1207 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.27347370982170105, "epoch": 3.1789473684210527, "grad_norm": 0.0075611830689013, "learning_rate": 1e-06, "loss": 0.006, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 309.787109375, "completions/mean_terminated_length": 309.787109375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "entropy": 0.27822238206863403, "epoch": 3.181578947368421, "frac_reward_zero_std": 0.65625, "grad_norm": 0.009059883654117584, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 254579324.0, "reward": 0.728661298751831, "reward_std": 0.05693569406867027, "rewards/progression_diversity/mean": -8.019153028726578e-05, "rewards/progression_diversity/std": 0.0014274234417825937, "rewards/symbolic_reward_accuracy/mean": 0.7578125, "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, "rewards/symbolic_reward_partial_score/mean": 0.9132487177848816, "rewards/symbolic_reward_partial_score/std": 0.1662641316652298, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.070951223373413, "sampling/importance_sampling_ratio/min": 0.00033222284400835633, "sampling/sampling_logp_difference/max": 8.00970458984375, "sampling/sampling_logp_difference/mean": 0.14075878262519836, "step": 1209 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.28292274475097656, "epoch": 3.1842105263157894, "grad_norm": 0.0033197884913533926, "learning_rate": 1e-06, "loss": 0.0006, "step": 1210 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2789665162563324, "epoch": 3.1868421052631577, "grad_norm": 0.0019438323797658086, "learning_rate": 1e-06, "loss": 0.0007, "step": 1211 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.28156426548957825, "epoch": 3.1894736842105265, "grad_norm": 0.006589422933757305, "learning_rate": 1e-06, "loss": -0.0002, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 310.138671875, "completions/mean_terminated_length": 310.138671875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.27788078784942627, "epoch": 3.192105263157895, "frac_reward_zero_std": 0.875, "grad_norm": 0.0016114837490022182, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 255143587.0, "reward": 0.7333984971046448, "reward_std": 0.02498510479927063, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.76953125, "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, "rewards/symbolic_reward_partial_score/mean": 0.9055989384651184, "rewards/symbolic_reward_partial_score/std": 0.18524064123630524, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0696802139282227, "sampling/importance_sampling_ratio/min": 7.757533353469626e-07, "sampling/sampling_logp_difference/max": 14.06943130493164, "sampling/sampling_logp_difference/mean": 0.1414531171321869, "step": 1213 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2840680330991745, "epoch": 3.194736842105263, "grad_norm": 0.0007972043240442872, "learning_rate": 1e-06, "loss": 0.0002, "step": 1214 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.27601639926433563, "epoch": 3.1973684210526314, "grad_norm": 0.00040065511711873114, "learning_rate": 1e-06, "loss": -0.0002, "step": 1215 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2749394178390503, "epoch": 3.2, "grad_norm": 0.005288100801408291, "learning_rate": 1e-06, "loss": -0.0003, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 310.4296875, "completions/mean_terminated_length": 310.4296875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.2765820473432541, "epoch": 3.2026315789473685, "frac_reward_zero_std": 0.78125, "grad_norm": 0.002937893383204937, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 255726687.0, "reward": 0.7673335075378418, "reward_std": 0.03701866418123245, "rewards/progression_diversity/mean": -5.3788204240845516e-05, "rewards/progression_diversity/std": 0.00121708819642663, "rewards/symbolic_reward_accuracy/mean": 0.810546875, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.9366862177848816, "rewards/symbolic_reward_partial_score/std": 0.1429397314786911, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0701409578323364, "sampling/importance_sampling_ratio/min": 1.0667208698578179e-06, "sampling/sampling_logp_difference/max": 13.750921249389648, "sampling/sampling_logp_difference/mean": 0.13888150453567505, "step": 1217 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2759207487106323, "epoch": 3.205263157894737, "grad_norm": 0.0068762172013521194, "learning_rate": 1e-06, "loss": 0.0009, "step": 1218 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.27509351074695587, "epoch": 3.207894736842105, "grad_norm": 0.003071358660236001, "learning_rate": 1e-06, "loss": -0.0008, "step": 1219 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.27888280153274536, "epoch": 3.2105263157894735, "grad_norm": 0.0011286369990557432, "learning_rate": 1e-06, "loss": -0.0003, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 308.841796875, "completions/mean_terminated_length": 308.841796875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.2750840187072754, "epoch": 3.213157894736842, "frac_reward_zero_std": 0.6875, "grad_norm": 0.004304279573261738, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 256278798.0, "reward": 0.7308099269866943, "reward_std": 0.051743410527706146, "rewards/progression_diversity/mean": -6.387459143297747e-05, "rewards/progression_diversity/std": 0.0013390847016125917, "rewards/symbolic_reward_accuracy/mean": 0.755859375, "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, "rewards/symbolic_reward_partial_score/mean": 0.92431640625, "rewards/symbolic_reward_partial_score/std": 0.14684316515922546, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0694196224212646, "sampling/importance_sampling_ratio/min": 0.0019063284853473306, "sampling/sampling_logp_difference/max": 6.262576103210449, "sampling/sampling_logp_difference/mean": 0.14004170894622803, "step": 1221 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.27862994372844696, "epoch": 3.2157894736842105, "grad_norm": 0.002798795234411955, "learning_rate": 1e-06, "loss": -0.0002, "step": 1222 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2836599797010422, "epoch": 3.218421052631579, "grad_norm": 0.0013353745453059673, "learning_rate": 1e-06, "loss": -0.0013, "step": 1223 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.28007832169532776, "epoch": 3.221052631578947, "grad_norm": 0.00800714548677206, "learning_rate": 1e-06, "loss": 0.0011, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1284.0, "completions/max_terminated_length": 1284.0, "completions/mean_length": 305.09765625, "completions/mean_terminated_length": 305.09765625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.2768820822238922, "epoch": 3.223684210526316, "frac_reward_zero_std": 0.78125, "grad_norm": 0.007991495542228222, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 256848000.0, "reward": 0.8162106275558472, "reward_std": 0.038142770528793335, "rewards/progression_diversity/mean": -3.331673360662535e-05, "rewards/progression_diversity/std": 0.0007538715726695955, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.955078125, "rewards/symbolic_reward_partial_score/std": 0.1319531947374344, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0698317289352417, "sampling/importance_sampling_ratio/min": 4.832327613257803e-05, "sampling/sampling_logp_difference/max": 9.937597274780273, "sampling/sampling_logp_difference/mean": 0.13788989186286926, "step": 1225 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.274690717458725, "epoch": 3.2263157894736842, "grad_norm": 0.006914569530636072, "learning_rate": 1e-06, "loss": 0.0008, "step": 1226 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.27500303089618683, "epoch": 3.2289473684210526, "grad_norm": 0.0019397508585825562, "learning_rate": 1e-06, "loss": 0.0005, "step": 1227 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.277148962020874, "epoch": 3.231578947368421, "grad_norm": 0.002220799447968602, "learning_rate": 1e-06, "loss": -0.0009, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 297.98828125, "completions/mean_terminated_length": 297.98828125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2688996493816376, "epoch": 3.2342105263157896, "frac_reward_zero_std": 0.78125, "grad_norm": 0.005563766695559025, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 257413402.0, "reward": 0.7893544435501099, "reward_std": 0.037740692496299744, "rewards/progression_diversity/mean": -0.00010110770381288603, "rewards/progression_diversity/std": 0.0022878062445670366, "rewards/symbolic_reward_accuracy/mean": 0.841796875, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.9475911855697632, "rewards/symbolic_reward_partial_score/std": 0.1305176019668579, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.069476842880249, "sampling/importance_sampling_ratio/min": 0.00020935774955432862, "sampling/sampling_logp_difference/max": 8.471466064453125, "sampling/sampling_logp_difference/mean": 0.1374514102935791, "step": 1229 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2761152386665344, "epoch": 3.236842105263158, "grad_norm": 0.004724735394120216, "learning_rate": 1e-06, "loss": 0.0004, "step": 1230 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.27613112330436707, "epoch": 3.2394736842105263, "grad_norm": 0.0010733611416071653, "learning_rate": 1e-06, "loss": 0.0009, "step": 1231 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.276006281375885, "epoch": 3.2421052631578946, "grad_norm": 0.004941441584378481, "learning_rate": 1e-06, "loss": -0.0007, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 300.60546875, "completions/mean_terminated_length": 300.60546875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2708422541618347, "epoch": 3.2447368421052634, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0021209348924458027, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 257974288.0, "reward": 0.8088867664337158, "reward_std": 0.04001738876104355, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.86328125, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.9697265625, "rewards/symbolic_reward_partial_score/std": 0.07978670299053192, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0682693719863892, "sampling/importance_sampling_ratio/min": 0.0019491848070174456, "sampling/sampling_logp_difference/max": 6.240344047546387, "sampling/sampling_logp_difference/mean": 0.13673865795135498, "step": 1233 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.27098171412944794, "epoch": 3.2473684210526317, "grad_norm": 0.005656884983181953, "learning_rate": 1e-06, "loss": -0.0, "step": 1234 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.27767522633075714, "epoch": 3.25, "grad_norm": 0.0009761822293512523, "learning_rate": 1e-06, "loss": 0.0013, "step": 1235 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.26963022351264954, "epoch": 3.2526315789473683, "grad_norm": 0.0069908928126096725, "learning_rate": 1e-06, "loss": 0.0006, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 300.423828125, "completions/mean_terminated_length": 300.423828125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.2717629671096802, "epoch": 3.2552631578947366, "frac_reward_zero_std": 0.875, "grad_norm": 0.006111988332122564, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 258553929.0, "reward": 0.7229492664337158, "reward_std": 0.02226562611758709, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.75, "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, "rewards/symbolic_reward_partial_score/mean": 0.9098306894302368, "rewards/symbolic_reward_partial_score/std": 0.17206500470638275, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0691936016082764, "sampling/importance_sampling_ratio/min": 0.00034812605008482933, "sampling/sampling_logp_difference/max": 7.962945938110352, "sampling/sampling_logp_difference/mean": 0.13795682787895203, "step": 1237 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.26943157613277435, "epoch": 3.2578947368421054, "grad_norm": 0.0070730592124164104, "learning_rate": 1e-06, "loss": -0.0008, "step": 1238 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2729327529668808, "epoch": 3.2605263157894737, "grad_norm": 0.000548347074072808, "learning_rate": 1e-06, "loss": -0.0, "step": 1239 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.27310749888420105, "epoch": 3.263157894736842, "grad_norm": 0.0007199924439191818, "learning_rate": 1e-06, "loss": 0.0008, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 299.822265625, "completions/mean_terminated_length": 299.822265625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2718695104122162, "epoch": 3.2657894736842104, "frac_reward_zero_std": 0.8125, "grad_norm": 0.007086692377924919, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 259099214.0, "reward": 0.7880857586860657, "reward_std": 0.02265714481472969, "rewards/progression_diversity/mean": -2.2376687411451712e-05, "rewards/progression_diversity/std": 0.0005063266144134104, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.955078125, "rewards/symbolic_reward_partial_score/std": 0.1139356791973114, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0676381587982178, "sampling/importance_sampling_ratio/min": 0.0004350260423962027, "sampling/sampling_logp_difference/max": 7.740104675292969, "sampling/sampling_logp_difference/mean": 0.13587021827697754, "step": 1241 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2723766714334488, "epoch": 3.268421052631579, "grad_norm": 0.000981443445198238, "learning_rate": 1e-06, "loss": -0.0, "step": 1242 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2734517604112625, "epoch": 3.2710526315789474, "grad_norm": 0.0007923005032353103, "learning_rate": 1e-06, "loss": 0.0001, "step": 1243 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2730279564857483, "epoch": 3.2736842105263158, "grad_norm": 0.01129809208214283, "learning_rate": 1e-06, "loss": 0.0003, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 296.421875, "completions/mean_terminated_length": 296.421875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2747436612844467, "epoch": 3.276315789473684, "frac_reward_zero_std": 0.59375, "grad_norm": 0.011831044219434261, "learning_rate": 1e-06, "loss": 0.0022, "num_tokens": 259648294.0, "reward": 0.6915037631988525, "reward_std": 0.07890917360782623, "rewards/progression_diversity/mean": -1.475131830375176e-05, "rewards/progression_diversity/std": 0.00033378423540852964, "rewards/symbolic_reward_accuracy/mean": 0.7109375, "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, "rewards/symbolic_reward_partial_score/mean": 0.8831380605697632, "rewards/symbolic_reward_partial_score/std": 0.19753031432628632, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0684700012207031, "sampling/importance_sampling_ratio/min": 6.384307926055044e-05, "sampling/sampling_logp_difference/max": 9.659082412719727, "sampling/sampling_logp_difference/mean": 0.13689962029457092, "step": 1245 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.203125, "entropy": 0.27058111131191254, "epoch": 3.2789473684210524, "grad_norm": 0.0020035523921251297, "learning_rate": 1e-06, "loss": 0.0001, "step": 1246 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2713269144296646, "epoch": 3.281578947368421, "grad_norm": 0.004604648798704147, "learning_rate": 1e-06, "loss": -0.0008, "step": 1247 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2719510495662689, "epoch": 3.2842105263157895, "grad_norm": 0.0033603121992200613, "learning_rate": 1e-06, "loss": -0.001, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 297.6171875, "completions/mean_terminated_length": 297.6171875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.26710009574890137, "epoch": 3.286842105263158, "frac_reward_zero_std": 0.90625, "grad_norm": 0.000993250752799213, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 260195138.0, "reward": 0.7834961414337158, "reward_std": 0.012673637829720974, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.9358723759651184, "rewards/symbolic_reward_partial_score/std": 0.16824129223823547, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0676732063293457, "sampling/importance_sampling_ratio/min": 4.49665293444923e-07, "sampling/sampling_logp_difference/max": 14.614762306213379, "sampling/sampling_logp_difference/mean": 0.13557732105255127, "step": 1249 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2676374465227127, "epoch": 3.2894736842105265, "grad_norm": 0.0007264050655066967, "learning_rate": 1e-06, "loss": 0.001, "step": 1250 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.270687460899353, "epoch": 3.292105263157895, "grad_norm": 0.0006257990025915205, "learning_rate": 1e-06, "loss": -0.0006, "step": 1251 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.26940611004829407, "epoch": 3.294736842105263, "grad_norm": 0.0004618112579919398, "learning_rate": 1e-06, "loss": 0.0006, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 300.1953125, "completions/mean_terminated_length": 300.1953125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.2769618481397629, "epoch": 3.2973684210526315, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0007221846026368439, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 260752646.0, "reward": 0.7583496570587158, "reward_std": 0.01035156287252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.80859375, "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, "rewards/symbolic_reward_partial_score/mean": 0.91064453125, "rewards/symbolic_reward_partial_score/std": 0.2116525024175644, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0673704147338867, "sampling/importance_sampling_ratio/min": 0.0001698398555163294, "sampling/sampling_logp_difference/max": 8.680654525756836, "sampling/sampling_logp_difference/mean": 0.13527140021324158, "step": 1253 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2640591859817505, "epoch": 3.3, "grad_norm": 0.0006942602340131998, "learning_rate": 1e-06, "loss": -0.0003, "step": 1254 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2654099464416504, "epoch": 3.3026315789473686, "grad_norm": 0.0003709284064825624, "learning_rate": 1e-06, "loss": -0.0003, "step": 1255 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2712586522102356, "epoch": 3.305263157894737, "grad_norm": 0.006947053596377373, "learning_rate": 1e-06, "loss": 0.0008, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 301.51953125, "completions/mean_terminated_length": 301.51953125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.26971058547496796, "epoch": 3.307894736842105, "frac_reward_zero_std": 0.875, "grad_norm": 0.0009350181207992136, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 261279792.0, "reward": 0.782421886920929, "reward_std": 0.0234375037252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.9361978769302368, "rewards/symbolic_reward_partial_score/std": 0.1689293086528778, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0669386386871338, "sampling/importance_sampling_ratio/min": 5.623637093776779e-07, "sampling/sampling_logp_difference/max": 14.391117095947266, "sampling/sampling_logp_difference/mean": 0.13500767946243286, "step": 1257 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2693052887916565, "epoch": 3.3105263157894735, "grad_norm": 0.0008573352824896574, "learning_rate": 1e-06, "loss": 0.0003, "step": 1258 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2674746811389923, "epoch": 3.3131578947368423, "grad_norm": 0.008638602681457996, "learning_rate": 1e-06, "loss": 0.0006, "step": 1259 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2724864184856415, "epoch": 3.3157894736842106, "grad_norm": 0.0008081798441708088, "learning_rate": 1e-06, "loss": 0.0002, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 331.048828125, "completions/mean_terminated_length": 299.6340637207031, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.26643824577331543, "epoch": 3.318421052631579, "frac_reward_zero_std": 0.8125, "grad_norm": 0.003980845678597689, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 261877801.0, "reward": 0.764533281326294, "reward_std": 0.02937212586402893, "rewards/progression_diversity/mean": -0.001752678770571947, "rewards/progression_diversity/std": 0.03948704153299332, "rewards/symbolic_reward_accuracy/mean": 0.810546875, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.9280599355697632, "rewards/symbolic_reward_partial_score/std": 0.15806357562541962, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0653882026672363, "sampling/importance_sampling_ratio/min": 0.0031156677287071943, "sampling/sampling_logp_difference/max": 5.7713117599487305, "sampling/sampling_logp_difference/mean": 0.12959060072898865, "step": 1261 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2636767327785492, "epoch": 3.3210526315789473, "grad_norm": 0.0007575162453576922, "learning_rate": 1e-06, "loss": 0.011, "step": 1262 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.26859790086746216, "epoch": 3.3236842105263156, "grad_norm": 0.0006437603733502328, "learning_rate": 1e-06, "loss": -0.0001, "step": 1263 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.26694221794605255, "epoch": 3.3263157894736843, "grad_norm": 0.004774495959281921, "learning_rate": 1e-06, "loss": 0.0005, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 299.962890625, "completions/mean_terminated_length": 299.962890625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.26413773000240326, "epoch": 3.3289473684210527, "frac_reward_zero_std": 0.65625, "grad_norm": 0.009386946447193623, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 262445878.0, "reward": 0.710888683795929, "reward_std": 0.07148764282464981, "rewards/progression_diversity/mean": -2.774343556666281e-06, "rewards/progression_diversity/std": 6.277623469941318e-05, "rewards/symbolic_reward_accuracy/mean": 0.734375, "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, "rewards/symbolic_reward_partial_score/mean": 0.90087890625, "rewards/symbolic_reward_partial_score/std": 0.18419794738292694, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.067664623260498, "sampling/importance_sampling_ratio/min": 0.00022712373174726963, "sampling/sampling_logp_difference/max": 8.390015602111816, "sampling/sampling_logp_difference/mean": 0.13395264744758606, "step": 1265 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2680993378162384, "epoch": 3.331578947368421, "grad_norm": 0.004663961473852396, "learning_rate": 1e-06, "loss": 0.0007, "step": 1266 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2699938416481018, "epoch": 3.3342105263157893, "grad_norm": 0.006105378735810518, "learning_rate": 1e-06, "loss": -0.0008, "step": 1267 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.26628097891807556, "epoch": 3.336842105263158, "grad_norm": 0.001197861391119659, "learning_rate": 1e-06, "loss": 0.0009, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.005859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 388.677734375, "completions/mean_terminated_length": 294.40277099609375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2528943419456482, "epoch": 3.3394736842105264, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0033977432176470757, "learning_rate": 1e-06, "loss": 0.0351, "num_tokens": 263023825.0, "reward": 0.7753678560256958, "reward_std": 0.027289744466543198, "rewards/progression_diversity/mean": -0.0022837440483272076, "rewards/progression_diversity/std": 0.029835714027285576, "rewards/symbolic_reward_accuracy/mean": 0.8203125, "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, "rewards/symbolic_reward_partial_score/mean": 0.9453125, "rewards/symbolic_reward_partial_score/std": 0.13062210381031036, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619099140167236, "sampling/importance_sampling_ratio/min": 1.0306473086529877e-05, "sampling/sampling_logp_difference/max": 11.482738494873047, "sampling/sampling_logp_difference/mean": 0.12313318997621536, "step": 1269 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2718654125928879, "epoch": 3.3421052631578947, "grad_norm": 0.0010849793907254934, "learning_rate": 1e-06, "loss": -0.0005, "step": 1270 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2673253118991852, "epoch": 3.344736842105263, "grad_norm": 0.0015741854440420866, "learning_rate": 1e-06, "loss": -0.0005, "step": 1271 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.26140786707401276, "epoch": 3.3473684210526318, "grad_norm": 0.006531618069857359, "learning_rate": 1e-06, "loss": 0.0265, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 291.61328125, "completions/mean_terminated_length": 291.61328125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.26315978169441223, "epoch": 3.35, "frac_reward_zero_std": 0.8125, "grad_norm": 0.008593485690653324, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 263592299.0, "reward": 0.7869629263877869, "reward_std": 0.03779665380716324, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.9357096552848816, "rewards/symbolic_reward_partial_score/std": 0.1584857553243637, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0645825862884521, "sampling/importance_sampling_ratio/min": 8.691700713825412e-06, "sampling/sampling_logp_difference/max": 11.653141975402832, "sampling/sampling_logp_difference/mean": 0.1318080574274063, "step": 1273 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2618803530931473, "epoch": 3.3526315789473684, "grad_norm": 0.0023191396612674, "learning_rate": 1e-06, "loss": 0.0012, "step": 1274 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.25944313406944275, "epoch": 3.3552631578947367, "grad_norm": 0.0014273182023316622, "learning_rate": 1e-06, "loss": -0.0007, "step": 1275 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.26387016475200653, "epoch": 3.3578947368421055, "grad_norm": 0.001026469049975276, "learning_rate": 1e-06, "loss": 0.0004, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 285.99609375, "completions/mean_terminated_length": 285.99609375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.26820504665374756, "epoch": 3.360526315789474, "frac_reward_zero_std": 0.75, "grad_norm": 0.0026411775033921003, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 264145865.0, "reward": 0.8200682401657104, "reward_std": 0.048733148723840714, "rewards/progression_diversity/mean": -1.5301797247957438e-05, "rewards/progression_diversity/std": 0.000346240121871233, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9601237177848816, "rewards/symbolic_reward_partial_score/std": 0.11702742427587509, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0657628774642944, "sampling/importance_sampling_ratio/min": 1.4597096651414176e-06, "sampling/sampling_logp_difference/max": 13.437273025512695, "sampling/sampling_logp_difference/mean": 0.13426843285560608, "step": 1277 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.26382070779800415, "epoch": 3.363157894736842, "grad_norm": 0.0014408208662644029, "learning_rate": 1e-06, "loss": -0.0001, "step": 1278 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2653789073228836, "epoch": 3.3657894736842104, "grad_norm": 0.003974873572587967, "learning_rate": 1e-06, "loss": 0.0003, "step": 1279 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.26832902431488037, "epoch": 3.3684210526315788, "grad_norm": 0.0066981143318116665, "learning_rate": 1e-06, "loss": -0.0005, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 289.771484375, "completions/mean_terminated_length": 289.771484375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.26377448439598083, "epoch": 3.3710526315789475, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0039009610190987587, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 264702388.0, "reward": 0.7869141101837158, "reward_std": 0.05072927474975586, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8359375, "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, "rewards/symbolic_reward_partial_score/mean": 0.951171875, "rewards/symbolic_reward_partial_score/std": 0.12138548493385315, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.066290259361267, "sampling/importance_sampling_ratio/min": 4.976352101948578e-06, "sampling/sampling_logp_difference/max": 12.210813522338867, "sampling/sampling_logp_difference/mean": 0.13322612643241882, "step": 1281 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.26462262868881226, "epoch": 3.373684210526316, "grad_norm": 0.003261660458520055, "learning_rate": 1e-06, "loss": -0.0008, "step": 1282 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2681008577346802, "epoch": 3.376315789473684, "grad_norm": 0.0013276836834847927, "learning_rate": 1e-06, "loss": 0.0002, "step": 1283 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.26759590208530426, "epoch": 3.3789473684210525, "grad_norm": 0.0027356266509741545, "learning_rate": 1e-06, "loss": 0.0004, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 293.91796875, "completions/mean_terminated_length": 293.91796875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.26821961998939514, "epoch": 3.3815789473684212, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0032564904540777206, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 265264362.0, "reward": 0.8073240518569946, "reward_std": 0.05884532630443573, "rewards/progression_diversity/mean": -1.5981662727426738e-05, "rewards/progression_diversity/std": 0.0003616237663663924, "rewards/symbolic_reward_accuracy/mean": 0.875, "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, "rewards/symbolic_reward_partial_score/mean": 0.9410806894302368, "rewards/symbolic_reward_partial_score/std": 0.17385253310203552, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0658769607543945, "sampling/importance_sampling_ratio/min": 5.955379037914099e-06, "sampling/sampling_logp_difference/max": 12.03121566772461, "sampling/sampling_logp_difference/mean": 0.13309326767921448, "step": 1285 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2667364180088043, "epoch": 3.3842105263157896, "grad_norm": 0.006176314316689968, "learning_rate": 1e-06, "loss": 0.0002, "step": 1286 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2665433883666992, "epoch": 3.386842105263158, "grad_norm": 0.0025707941967993975, "learning_rate": 1e-06, "loss": -0.0004, "step": 1287 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2672065645456314, "epoch": 3.389473684210526, "grad_norm": 0.0027666857931762934, "learning_rate": 1e-06, "loss": 0.0007, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 291.099609375, "completions/mean_terminated_length": 291.099609375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.26022814214229584, "epoch": 3.3921052631578945, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0038764202035963535, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 265834237.0, "reward": 0.7874018549919128, "reward_std": 0.04602134972810745, "rewards/progression_diversity/mean": -5.456768849398941e-05, "rewards/progression_diversity/std": 0.0012347258161753416, "rewards/symbolic_reward_accuracy/mean": 0.841796875, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.9410807490348816, "rewards/symbolic_reward_partial_score/std": 0.14504456520080566, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065362572669983, "sampling/importance_sampling_ratio/min": 9.59261305979453e-05, "sampling/sampling_logp_difference/max": 9.251932144165039, "sampling/sampling_logp_difference/mean": 0.13144491612911224, "step": 1289 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2617509812116623, "epoch": 3.3947368421052633, "grad_norm": 0.0021487956400960684, "learning_rate": 1e-06, "loss": 0.0009, "step": 1290 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.261522576212883, "epoch": 3.3973684210526316, "grad_norm": 0.006694071926176548, "learning_rate": 1e-06, "loss": -0.0005, "step": 1291 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.26264551281929016, "epoch": 3.4, "grad_norm": 0.001856036949902773, "learning_rate": 1e-06, "loss": -0.0006, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 288.107421875, "completions/mean_terminated_length": 288.107421875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2683214247226715, "epoch": 3.4026315789473682, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0031836843118071556, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 266343668.0, "reward": 0.8083491325378418, "reward_std": 0.03162518888711929, "rewards/progression_diversity/mean": -4.605756112141535e-05, "rewards/progression_diversity/std": 0.0007501619402319193, "rewards/symbolic_reward_accuracy/mean": 0.86328125, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.9679361581802368, "rewards/symbolic_reward_partial_score/std": 0.08607691526412964, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0654386281967163, "sampling/importance_sampling_ratio/min": 0.007917046546936035, "sampling/sampling_logp_difference/max": 4.8387370109558105, "sampling/sampling_logp_difference/mean": 0.13370326161384583, "step": 1293 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.27221499383449554, "epoch": 3.405263157894737, "grad_norm": 0.002062569372355938, "learning_rate": 1e-06, "loss": -0.0001, "step": 1294 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2661845535039902, "epoch": 3.4078947368421053, "grad_norm": 0.0034429843071848154, "learning_rate": 1e-06, "loss": 0.0004, "step": 1295 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2656361758708954, "epoch": 3.4105263157894736, "grad_norm": 0.0014259653398767114, "learning_rate": 1e-06, "loss": 0.0007, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 316.431640625, "completions/mean_terminated_length": 284.9882507324219, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.26236018538475037, "epoch": 3.413157894736842, "frac_reward_zero_std": 0.875, "grad_norm": 0.0020829113200306892, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 266900465.0, "reward": 0.7546237707138062, "reward_std": 0.02533288300037384, "rewards/progression_diversity/mean": -0.001499977894127369, "rewards/progression_diversity/std": 0.0339406281709671, "rewards/symbolic_reward_accuracy/mean": 0.7890625, "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, "rewards/symbolic_reward_partial_score/mean": 0.9373372793197632, "rewards/symbolic_reward_partial_score/std": 0.1312447488307953, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0632517337799072, "sampling/importance_sampling_ratio/min": 9.111730037147936e-07, "sampling/sampling_logp_difference/max": 13.908533096313477, "sampling/sampling_logp_difference/mean": 0.12851907312870026, "step": 1297 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.26197686791419983, "epoch": 3.4157894736842107, "grad_norm": 0.001598592847585678, "learning_rate": 1e-06, "loss": -0.0006, "step": 1298 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.25969552993774414, "epoch": 3.418421052631579, "grad_norm": 0.005127459764480591, "learning_rate": 1e-06, "loss": 0.0276, "step": 1299 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2652065008878708, "epoch": 3.4210526315789473, "grad_norm": 0.0016107510309666395, "learning_rate": 1e-06, "loss": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 292.716796875, "completions/mean_terminated_length": 292.716796875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.27297018468379974, "epoch": 3.4236842105263157, "frac_reward_zero_std": 0.75, "grad_norm": 0.004471481777727604, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 267449312.0, "reward": 0.73178631067276, "reward_std": 0.05781938135623932, "rewards/progression_diversity/mean": -8.318589971167967e-05, "rewards/progression_diversity/std": 0.0018822819693014026, "rewards/symbolic_reward_accuracy/mean": 0.759765625, "rewards/symbolic_reward_accuracy/std": 0.4276435375213623, "rewards/symbolic_reward_partial_score/mean": 0.9197591543197632, "rewards/symbolic_reward_partial_score/std": 0.1563287228345871, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0674350261688232, "sampling/importance_sampling_ratio/min": 0.0004979167133569717, "sampling/sampling_logp_difference/max": 7.605077743530273, "sampling/sampling_logp_difference/mean": 0.13303379714488983, "step": 1301 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2664797157049179, "epoch": 3.4263157894736844, "grad_norm": 0.0027420800179243088, "learning_rate": 1e-06, "loss": -0.0009, "step": 1302 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2613489329814911, "epoch": 3.4289473684210527, "grad_norm": 0.002885916270315647, "learning_rate": 1e-06, "loss": -0.0001, "step": 1303 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2659630626440048, "epoch": 3.431578947368421, "grad_norm": 0.0036468280013650656, "learning_rate": 1e-06, "loss": -0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 287.36328125, "completions/mean_terminated_length": 287.36328125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.26053567230701447, "epoch": 3.4342105263157894, "frac_reward_zero_std": 0.78125, "grad_norm": 0.005564328748732805, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 268009434.0, "reward": 0.7170894742012024, "reward_std": 0.02636650763452053, "rewards/progression_diversity/mean": -4.2535000829957426e-05, "rewards/progression_diversity/std": 0.0006822184659540653, "rewards/symbolic_reward_accuracy/mean": 0.740234375, "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, "rewards/symbolic_reward_partial_score/mean": 0.9098306894302368, "rewards/symbolic_reward_partial_score/std": 0.16496829688549042, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0655784606933594, "sampling/importance_sampling_ratio/min": 5.745126145484392e-06, "sampling/sampling_logp_difference/max": 12.067158699035645, "sampling/sampling_logp_difference/mean": 0.13085705041885376, "step": 1305 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.26320306956768036, "epoch": 3.4368421052631577, "grad_norm": 0.004667403642088175, "learning_rate": 1e-06, "loss": 0.001, "step": 1306 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.261497437953949, "epoch": 3.4394736842105265, "grad_norm": 0.0008139449637383223, "learning_rate": 1e-06, "loss": -0.001, "step": 1307 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.25740575790405273, "epoch": 3.442105263157895, "grad_norm": 0.001273100497201085, "learning_rate": 1e-06, "loss": -0.0007, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 286.341796875, "completions/mean_terminated_length": 286.341796875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.26350516080856323, "epoch": 3.444736842105263, "frac_reward_zero_std": 0.90625, "grad_norm": 0.008655845187604427, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 268549001.0, "reward": 0.810546875, "reward_std": 0.01718750223517418, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.869140625, "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, "rewards/symbolic_reward_partial_score/mean": 0.9635416269302368, "rewards/symbolic_reward_partial_score/std": 0.1107163056731224, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0654571056365967, "sampling/importance_sampling_ratio/min": 2.639793456182815e-05, "sampling/sampling_logp_difference/max": 10.542224884033203, "sampling/sampling_logp_difference/mean": 0.13366496562957764, "step": 1309 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.26279450953006744, "epoch": 3.4473684210526314, "grad_norm": 0.00044046566472388804, "learning_rate": 1e-06, "loss": 0.0001, "step": 1310 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.26690690219402313, "epoch": 3.45, "grad_norm": 0.0007240689010359347, "learning_rate": 1e-06, "loss": -0.0004, "step": 1311 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2649247348308563, "epoch": 3.4526315789473685, "grad_norm": 0.0007236701203510165, "learning_rate": 1e-06, "loss": -0.0005, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 289.13671875, "completions/mean_terminated_length": 289.13671875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.26410022377967834, "epoch": 3.455263157894737, "frac_reward_zero_std": 0.71875, "grad_norm": 0.004910608287900686, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 269110191.0, "reward": 0.803564190864563, "reward_std": 0.04589978978037834, "rewards/progression_diversity/mean": -3.375462256371975e-05, "rewards/progression_diversity/std": 0.0007637799135409296, "rewards/symbolic_reward_accuracy/mean": 0.86328125, "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, "rewards/symbolic_reward_partial_score/mean": 0.9519856572151184, "rewards/symbolic_reward_partial_score/std": 0.13935703039169312, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0666524171829224, "sampling/importance_sampling_ratio/min": 0.0037739586550742388, "sampling/sampling_logp_difference/max": 5.5796308517456055, "sampling/sampling_logp_difference/mean": 0.1313626617193222, "step": 1313 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2641823887825012, "epoch": 3.457894736842105, "grad_norm": 0.0013223750283941627, "learning_rate": 1e-06, "loss": 0.0003, "step": 1314 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2643107920885086, "epoch": 3.4605263157894735, "grad_norm": 0.007209242787212133, "learning_rate": 1e-06, "loss": -0.0, "step": 1315 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2640030235052109, "epoch": 3.463157894736842, "grad_norm": 0.004676418378949165, "learning_rate": 1e-06, "loss": -0.0003, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 284.884765625, "completions/mean_terminated_length": 284.884765625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.2607775628566742, "epoch": 3.4657894736842105, "frac_reward_zero_std": 0.875, "grad_norm": 0.000587392773013562, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 269647188.0, "reward": 0.7455061674118042, "reward_std": 0.005475577898323536, "rewards/progression_diversity/mean": -0.0001706962939351797, "rewards/progression_diversity/std": 0.002271100878715515, "rewards/symbolic_reward_accuracy/mean": 0.779296875, "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, "rewards/symbolic_reward_partial_score/mean": 0.9264323115348816, "rewards/symbolic_reward_partial_score/std": 0.1475026160478592, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065974473953247, "sampling/importance_sampling_ratio/min": 0.00015449296915903687, "sampling/sampling_logp_difference/max": 8.775362014770508, "sampling/sampling_logp_difference/mean": 0.13090203702449799, "step": 1317 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2620484530925751, "epoch": 3.468421052631579, "grad_norm": 0.0020306191872805357, "learning_rate": 1e-06, "loss": 0.0002, "step": 1318 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.262591689825058, "epoch": 3.4710526315789476, "grad_norm": 0.00032434010063298047, "learning_rate": 1e-06, "loss": 0.0002, "step": 1319 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.26363037526607513, "epoch": 3.473684210526316, "grad_norm": 0.0005222202162258327, "learning_rate": 1e-06, "loss": -0.0002, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 286.896484375, "completions/mean_terminated_length": 286.896484375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "entropy": 0.2641294151544571, "epoch": 3.4763157894736842, "frac_reward_zero_std": 0.78125, "grad_norm": 0.005580322351306677, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 270190207.0, "reward": 0.759863018989563, "reward_std": 0.06166636943817139, "rewards/progression_diversity/mean": -2.578865496616345e-05, "rewards/progression_diversity/std": 0.0005835306365042925, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.9195963144302368, "rewards/symbolic_reward_partial_score/std": 0.17569199204444885, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0666582584381104, "sampling/importance_sampling_ratio/min": 0.00020486714493017644, "sampling/sampling_logp_difference/max": 8.493148803710938, "sampling/sampling_logp_difference/mean": 0.13263538479804993, "step": 1321 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.26649659872055054, "epoch": 3.4789473684210526, "grad_norm": 0.004749960731714964, "learning_rate": 1e-06, "loss": 0.0005, "step": 1322 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2594447731971741, "epoch": 3.481578947368421, "grad_norm": 0.0007304165628738701, "learning_rate": 1e-06, "loss": -0.0008, "step": 1323 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.26071953773498535, "epoch": 3.4842105263157896, "grad_norm": 0.005144048947840929, "learning_rate": 1e-06, "loss": 0.001, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 283.166015625, "completions/mean_terminated_length": 283.166015625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.26111868023872375, "epoch": 3.486842105263158, "frac_reward_zero_std": 0.75, "grad_norm": 0.013651230372488499, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 270757684.0, "reward": 0.7337884902954102, "reward_std": 0.054217878729104996, "rewards/progression_diversity/mean": -6.13354059169069e-05, "rewards/progression_diversity/std": 0.001387861673720181, "rewards/symbolic_reward_accuracy/mean": 0.76171875, "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, "rewards/symbolic_reward_partial_score/mean": 0.9225260019302368, "rewards/symbolic_reward_partial_score/std": 0.16891297698020935, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0644073486328125, "sampling/importance_sampling_ratio/min": 4.552837395976894e-08, "sampling/sampling_logp_difference/max": 16.904930114746094, "sampling/sampling_logp_difference/mean": 0.1341029703617096, "step": 1325 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2623433470726013, "epoch": 3.4894736842105263, "grad_norm": 0.004322811029851437, "learning_rate": 1e-06, "loss": -0.0001, "step": 1326 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.256848469376564, "epoch": 3.4921052631578946, "grad_norm": 0.007806100882589817, "learning_rate": 1e-06, "loss": 0.0007, "step": 1327 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.25882723927497864, "epoch": 3.4947368421052634, "grad_norm": 0.00808742269873619, "learning_rate": 1e-06, "loss": 0.0001, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 281.03515625, "completions/mean_terminated_length": 281.03515625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.25531578063964844, "epoch": 3.4973684210526317, "frac_reward_zero_std": 0.8125, "grad_norm": 0.001909639686346054, "learning_rate": 1e-06, "loss": -0.0011, "num_tokens": 271302054.0, "reward": 0.8040038347244263, "reward_std": 0.03667568787932396, "rewards/progression_diversity/mean": -7.209268460428575e-06, "rewards/progression_diversity/std": 0.0001631271152291447, "rewards/symbolic_reward_accuracy/mean": 0.859375, "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, "rewards/symbolic_reward_partial_score/mean": 0.9612630605697632, "rewards/symbolic_reward_partial_score/std": 0.09996785968542099, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0645685195922852, "sampling/importance_sampling_ratio/min": 0.0025024518836289644, "sampling/sampling_logp_difference/max": 5.990484237670898, "sampling/sampling_logp_difference/mean": 0.1278265118598938, "step": 1329 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.25595368444919586, "epoch": 3.5, "grad_norm": 0.0006472188397310674, "learning_rate": 1e-06, "loss": -0.0001, "step": 1330 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2598939687013626, "epoch": 3.5026315789473683, "grad_norm": 0.0017913918709382415, "learning_rate": 1e-06, "loss": 0.0011, "step": 1331 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.25719694793224335, "epoch": 3.5052631578947366, "grad_norm": 0.002060194732621312, "learning_rate": 1e-06, "loss": -0.0001, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 283.7265625, "completions/mean_terminated_length": 283.7265625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.2666303217411041, "epoch": 3.5078947368421054, "frac_reward_zero_std": 0.84375, "grad_norm": 0.00912644062191248, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 271841626.0, "reward": 0.7412598133087158, "reward_std": 0.023930229246616364, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.775390625, "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, "rewards/symbolic_reward_partial_score/mean": 0.9200846552848816, "rewards/symbolic_reward_partial_score/std": 0.1616220623254776, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0666561126708984, "sampling/importance_sampling_ratio/min": 0.0017295932630077004, "sampling/sampling_logp_difference/max": 6.359869003295898, "sampling/sampling_logp_difference/mean": 0.13231700658798218, "step": 1333 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.26160600781440735, "epoch": 3.5105263157894737, "grad_norm": 0.0008754784357734025, "learning_rate": 1e-06, "loss": -0.0005, "step": 1334 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2645406574010849, "epoch": 3.513157894736842, "grad_norm": 0.00632048211991787, "learning_rate": 1e-06, "loss": 0.0006, "step": 1335 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.26721756160259247, "epoch": 3.515789473684211, "grad_norm": 0.0029721923638135195, "learning_rate": 1e-06, "loss": 0.0006, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 280.01953125, "completions/mean_terminated_length": 280.01953125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.26314379274845123, "epoch": 3.518421052631579, "frac_reward_zero_std": 0.96875, "grad_norm": 0.0004468945844564587, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 272389124.0, "reward": 0.7737305164337158, "reward_std": 0.005078125279396772, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.810546875, "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, "rewards/symbolic_reward_partial_score/mean": 0.9580078125, "rewards/symbolic_reward_partial_score/std": 0.09104800969362259, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0662033557891846, "sampling/importance_sampling_ratio/min": 0.0034528106916695833, "sampling/sampling_logp_difference/max": 5.668566703796387, "sampling/sampling_logp_difference/mean": 0.13059526681900024, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.262877956032753, "epoch": 3.5210526315789474, "grad_norm": 0.005230678245425224, "learning_rate": 1e-06, "loss": 0.0005, "step": 1338 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.25840798020362854, "epoch": 3.5236842105263158, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0001, "step": 1339 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2623683661222458, "epoch": 3.526315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0001, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 281.23828125, "completions/mean_terminated_length": 281.23828125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.26039692759513855, "epoch": 3.5289473684210524, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0022722186986356974, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 272901054.0, "reward": 0.7767575979232788, "reward_std": 0.0289163701236248, "rewards/progression_diversity/mean": -2.1483450836967677e-05, "rewards/progression_diversity/std": 0.00048611496458761394, "rewards/symbolic_reward_accuracy/mean": 0.82421875, "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, "rewards/symbolic_reward_partial_score/mean": 0.9407552480697632, "rewards/symbolic_reward_partial_score/std": 0.13769857585430145, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.064164400100708, "sampling/importance_sampling_ratio/min": 0.00021925149485468864, "sampling/sampling_logp_difference/max": 8.425291061401367, "sampling/sampling_logp_difference/mean": 0.12925145030021667, "step": 1341 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.26014891266822815, "epoch": 3.531578947368421, "grad_norm": 0.0019025214714929461, "learning_rate": 1e-06, "loss": -0.0003, "step": 1342 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.26306456327438354, "epoch": 3.5342105263157895, "grad_norm": 0.00047899127821438015, "learning_rate": 1e-06, "loss": 0.0012, "step": 1343 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.25312456488609314, "epoch": 3.536842105263158, "grad_norm": 0.006323935464024544, "learning_rate": 1e-06, "loss": -0.0009, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 272.009765625, "completions/mean_terminated_length": 272.009765625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "entropy": 0.26376163959503174, "epoch": 3.5394736842105265, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0009184160153381526, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 273438275.0, "reward": 0.79150390625, "reward_std": 0.019352849572896957, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.9430338144302368, "rewards/symbolic_reward_partial_score/std": 0.15738774836063385, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0648845434188843, "sampling/importance_sampling_ratio/min": 0.004287427756935358, "sampling/sampling_logp_difference/max": 5.452068328857422, "sampling/sampling_logp_difference/mean": 0.130482017993927, "step": 1345 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.26225438714027405, "epoch": 3.542105263157895, "grad_norm": 0.0004132894682697952, "learning_rate": 1e-06, "loss": 0.0008, "step": 1346 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2599750906229019, "epoch": 3.544736842105263, "grad_norm": 0.0004837804299313575, "learning_rate": 1e-06, "loss": -0.0003, "step": 1347 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2639758139848709, "epoch": 3.5473684210526315, "grad_norm": 0.003910145256668329, "learning_rate": 1e-06, "loss": -0.0006, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 282.12109375, "completions/mean_terminated_length": 282.12109375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2593047469854355, "epoch": 3.55, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0070974016562104225, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 273977185.0, "reward": 0.7821289300918579, "reward_std": 0.03948177397251129, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.837890625, "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, "rewards/symbolic_reward_partial_score/mean": 0.9313150644302368, "rewards/symbolic_reward_partial_score/std": 0.17086125910282135, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0641109943389893, "sampling/importance_sampling_ratio/min": 0.0002722894714679569, "sampling/sampling_logp_difference/max": 8.20864486694336, "sampling/sampling_logp_difference/mean": 0.13093721866607666, "step": 1349 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.26446548104286194, "epoch": 3.5526315789473686, "grad_norm": 0.0035018729977309704, "learning_rate": 1e-06, "loss": -0.0007, "step": 1350 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.25814053416252136, "epoch": 3.555263157894737, "grad_norm": 0.00219136499799788, "learning_rate": 1e-06, "loss": -0.0005, "step": 1351 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2600250542163849, "epoch": 3.557894736842105, "grad_norm": 0.0010674957884475589, "learning_rate": 1e-06, "loss": 0.0006, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 276.349609375, "completions/mean_terminated_length": 276.349609375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.2562939524650574, "epoch": 3.5605263157894735, "frac_reward_zero_std": 0.875, "grad_norm": 0.0012435732642188668, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 274526996.0, "reward": 0.7630362510681152, "reward_std": 0.016148678958415985, "rewards/progression_diversity/mean": -8.660568710183725e-05, "rewards/progression_diversity/std": 0.0017533308127894998, "rewards/symbolic_reward_accuracy/mean": 0.802734375, "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, "rewards/symbolic_reward_partial_score/mean": 0.93798828125, "rewards/symbolic_reward_partial_score/std": 0.14083407819271088, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0641676187515259, "sampling/importance_sampling_ratio/min": 7.245798315125285e-06, "sampling/sampling_logp_difference/max": 11.835088729858398, "sampling/sampling_logp_difference/mean": 0.13085180521011353, "step": 1353 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.26552921533584595, "epoch": 3.5631578947368423, "grad_norm": 0.0011110203340649605, "learning_rate": 1e-06, "loss": 0.0009, "step": 1354 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2580649256706238, "epoch": 3.5657894736842106, "grad_norm": 0.0030603648629039526, "learning_rate": 1e-06, "loss": -0.0002, "step": 1355 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.26225745677948, "epoch": 3.568421052631579, "grad_norm": 0.0005105588352307677, "learning_rate": 1e-06, "loss": 0.0004, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 15014.0, "completions/max_terminated_length": 15014.0, "completions/mean_length": 303.72265625, "completions/mean_terminated_length": 303.72265625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2583850473165512, "epoch": 3.5710526315789473, "frac_reward_zero_std": 0.875, "grad_norm": 0.002601654501631856, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 275084966.0, "reward": 0.7722633481025696, "reward_std": 0.02485058829188347, "rewards/progression_diversity/mean": -0.00022908755636308342, "rewards/progression_diversity/std": 0.005183659493923187, "rewards/symbolic_reward_accuracy/mean": 0.822265625, "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, "rewards/symbolic_reward_partial_score/mean": 0.9303385615348816, "rewards/symbolic_reward_partial_score/std": 0.16154132783412933, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0623509883880615, "sampling/importance_sampling_ratio/min": 0.0005308397230692208, "sampling/sampling_logp_difference/max": 7.541050434112549, "sampling/sampling_logp_difference/mean": 0.12857502698898315, "step": 1357 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.257750928401947, "epoch": 3.5736842105263156, "grad_norm": 0.007370724342763424, "learning_rate": 1e-06, "loss": 0.0262, "step": 1358 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.25655198097229004, "epoch": 3.5763157894736843, "grad_norm": 0.0012657454935833812, "learning_rate": 1e-06, "loss": 0.0003, "step": 1359 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2600477933883667, "epoch": 3.5789473684210527, "grad_norm": 0.0008869823068380356, "learning_rate": 1e-06, "loss": 0.0002, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 271.412109375, "completions/mean_terminated_length": 271.412109375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.25901901721954346, "epoch": 3.581578947368421, "frac_reward_zero_std": 0.8125, "grad_norm": 0.004224944394081831, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 275619737.0, "reward": 0.738232433795929, "reward_std": 0.052099138498306274, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.767578125, "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, "rewards/symbolic_reward_partial_score/mean": 0.9256184697151184, "rewards/symbolic_reward_partial_score/std": 0.15123844146728516, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0638682842254639, "sampling/importance_sampling_ratio/min": 0.0003394426894374192, "sampling/sampling_logp_difference/max": 7.988205432891846, "sampling/sampling_logp_difference/mean": 0.13007181882858276, "step": 1361 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2613148093223572, "epoch": 3.5842105263157897, "grad_norm": 0.0035895437467843294, "learning_rate": 1e-06, "loss": 0.0006, "step": 1362 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.26094360649585724, "epoch": 3.586842105263158, "grad_norm": 0.002421436831355095, "learning_rate": 1e-06, "loss": -0.0006, "step": 1363 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2611810863018036, "epoch": 3.5894736842105264, "grad_norm": 0.004130725748836994, "learning_rate": 1e-06, "loss": 0.0004, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 276.9140625, "completions/mean_terminated_length": 276.9140625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.262939915060997, "epoch": 3.5921052631578947, "frac_reward_zero_std": 0.71875, "grad_norm": 0.003355985274538398, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 276174669.0, "reward": 0.7457025051116943, "reward_std": 0.07194074988365173, "rewards/progression_diversity/mean": -6.150803528726101e-05, "rewards/progression_diversity/std": 0.0010658144019544125, "rewards/symbolic_reward_accuracy/mean": 0.783203125, "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, "rewards/symbolic_reward_partial_score/mean": 0.9192708730697632, "rewards/symbolic_reward_partial_score/std": 0.16867269575595856, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0644850730895996, "sampling/importance_sampling_ratio/min": 0.0005842425744049251, "sampling/sampling_logp_difference/max": 7.445194244384766, "sampling/sampling_logp_difference/mean": 0.13014239072799683, "step": 1365 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2584049552679062, "epoch": 3.594736842105263, "grad_norm": 0.008518553338944912, "learning_rate": 1e-06, "loss": 0.0004, "step": 1366 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2618335336446762, "epoch": 3.5973684210526313, "grad_norm": 0.0025971720460802317, "learning_rate": 1e-06, "loss": 0.0007, "step": 1367 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.26184357702732086, "epoch": 3.6, "grad_norm": 0.001905369688756764, "learning_rate": 1e-06, "loss": -0.0009, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 279.400390625, "completions/mean_terminated_length": 279.400390625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.26188020408153534, "epoch": 3.6026315789473684, "frac_reward_zero_std": 0.84375, "grad_norm": 0.004297807812690735, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 276700858.0, "reward": 0.7915037870407104, "reward_std": 0.039244234561920166, "rewards/progression_diversity/mean": -1.5363497368525714e-05, "rewards/progression_diversity/std": 0.00034763626172207296, "rewards/symbolic_reward_accuracy/mean": 0.849609375, "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, "rewards/symbolic_reward_partial_score/mean": 0.9391275644302368, "rewards/symbolic_reward_partial_score/std": 0.15407440066337585, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.065136194229126, "sampling/importance_sampling_ratio/min": 0.0005955998785793781, "sampling/sampling_logp_difference/max": 7.425941467285156, "sampling/sampling_logp_difference/mean": 0.13098467886447906, "step": 1369 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.26105809211730957, "epoch": 3.6052631578947367, "grad_norm": 0.004089301452040672, "learning_rate": 1e-06, "loss": -0.0003, "step": 1370 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2629578411579132, "epoch": 3.6078947368421055, "grad_norm": 0.006167134270071983, "learning_rate": 1e-06, "loss": -0.0005, "step": 1371 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.25889579951763153, "epoch": 3.610526315789474, "grad_norm": 0.006159425713121891, "learning_rate": 1e-06, "loss": 0.0006, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 282.240234375, "completions/mean_terminated_length": 282.240234375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.2573448568582535, "epoch": 3.613157894736842, "frac_reward_zero_std": 0.6875, "grad_norm": 0.011007736437022686, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 277228821.0, "reward": 0.7931638956069946, "reward_std": 0.0751810073852539, "rewards/progression_diversity/mean": -1.619287286303006e-05, "rewards/progression_diversity/std": 0.00036640287726186216, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.9407552480697632, "rewards/symbolic_reward_partial_score/std": 0.15025249123573303, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0642564296722412, "sampling/importance_sampling_ratio/min": 9.635853348299861e-05, "sampling/sampling_logp_difference/max": 9.247434616088867, "sampling/sampling_logp_difference/mean": 0.1293666511774063, "step": 1373 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.26405245065689087, "epoch": 3.6157894736842104, "grad_norm": 0.00313938083127141, "learning_rate": 1e-06, "loss": 0.0005, "step": 1374 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.26752831041812897, "epoch": 3.6184210526315788, "grad_norm": 0.0031525518279522657, "learning_rate": 1e-06, "loss": -0.0003, "step": 1375 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2577017545700073, "epoch": 3.6210526315789475, "grad_norm": 0.002522657625377178, "learning_rate": 1e-06, "loss": -0.0011, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 279.568359375, "completions/mean_terminated_length": 279.568359375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.26175713539123535, "epoch": 3.623684210526316, "frac_reward_zero_std": 0.71875, "grad_norm": 0.004775851499289274, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 277769592.0, "reward": 0.8418935537338257, "reward_std": 0.05553803592920303, "rewards/progression_diversity/mean": -9.922643948812038e-05, "rewards/progression_diversity/std": 0.0015981434844434261, "rewards/symbolic_reward_accuracy/mean": 0.916015625, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.9742838144302368, "rewards/symbolic_reward_partial_score/std": 0.09493894129991531, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0631273984909058, "sampling/importance_sampling_ratio/min": 0.0002650808892212808, "sampling/sampling_logp_difference/max": 8.235475540161133, "sampling/sampling_logp_difference/mean": 0.1295655369758606, "step": 1377 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.25658373534679413, "epoch": 3.626315789473684, "grad_norm": 0.0063101500272750854, "learning_rate": 1e-06, "loss": 0.0005, "step": 1378 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.25902336835861206, "epoch": 3.6289473684210525, "grad_norm": 0.0057982527650892735, "learning_rate": 1e-06, "loss": -0.0, "step": 1379 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.26077018678188324, "epoch": 3.6315789473684212, "grad_norm": 0.005432313308119774, "learning_rate": 1e-06, "loss": -0.0005, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 280.951171875, "completions/mean_terminated_length": 280.951171875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.25891734659671783, "epoch": 3.6342105263157896, "frac_reward_zero_std": 0.8125, "grad_norm": 0.005559414625167847, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 278318751.0, "reward": 0.7943359613418579, "reward_std": 0.03447442501783371, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.9407551884651184, "rewards/symbolic_reward_partial_score/std": 0.1546209156513214, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0632920265197754, "sampling/importance_sampling_ratio/min": 5.388126373873092e-05, "sampling/sampling_logp_difference/max": 9.828727722167969, "sampling/sampling_logp_difference/mean": 0.12783315777778625, "step": 1381 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.25600631535053253, "epoch": 3.636842105263158, "grad_norm": 0.005643780808895826, "learning_rate": 1e-06, "loss": 0.0005, "step": 1382 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2576565742492676, "epoch": 3.639473684210526, "grad_norm": 0.0014839650830253959, "learning_rate": 1e-06, "loss": 0.0002, "step": 1383 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.25885675847530365, "epoch": 3.6421052631578945, "grad_norm": 0.001807388965971768, "learning_rate": 1e-06, "loss": -0.0007, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 284.384765625, "completions/mean_terminated_length": 284.384765625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.25956813991069794, "epoch": 3.6447368421052633, "frac_reward_zero_std": 0.6875, "grad_norm": 0.005248944275081158, "learning_rate": 1e-06, "loss": -0.0016, "num_tokens": 278891204.0, "reward": 0.7840330004692078, "reward_std": 0.05429323390126228, "rewards/progression_diversity/mean": -2.221354043285828e-05, "rewards/progression_diversity/std": 0.000469754304504022, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.9454752206802368, "rewards/symbolic_reward_partial_score/std": 0.13321354985237122, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.063035488128662, "sampling/importance_sampling_ratio/min": 0.00014696984726469964, "sampling/sampling_logp_difference/max": 8.82528305053711, "sampling/sampling_logp_difference/mean": 0.1286298781633377, "step": 1385 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.25760167837142944, "epoch": 3.6473684210526316, "grad_norm": 0.0028908655513077974, "learning_rate": 1e-06, "loss": 0.0017, "step": 1386 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.2583495229482651, "epoch": 3.65, "grad_norm": 0.0017066035652533174, "learning_rate": 1e-06, "loss": -0.0002, "step": 1387 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2530466616153717, "epoch": 3.6526315789473687, "grad_norm": 0.0022362645249813795, "learning_rate": 1e-06, "loss": -0.0002, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 288.814453125, "completions/mean_terminated_length": 288.814453125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.2583690285682678, "epoch": 3.655263157894737, "frac_reward_zero_std": 0.71875, "grad_norm": 0.005710822995752096, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 279461253.0, "reward": 0.7866692543029785, "reward_std": 0.05079422518610954, "rewards/progression_diversity/mean": -7.545309927081689e-05, "rewards/progression_diversity/std": 0.0013873358257114887, "rewards/symbolic_reward_accuracy/mean": 0.83203125, "rewards/symbolic_reward_accuracy/std": 0.374204158782959, "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, "rewards/symbolic_reward_partial_score/std": 0.10960590839385986, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0640332698822021, "sampling/importance_sampling_ratio/min": 0.0019857485312968493, "sampling/sampling_logp_difference/max": 6.22175931930542, "sampling/sampling_logp_difference/mean": 0.12767165899276733, "step": 1389 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.257061704993248, "epoch": 3.6578947368421053, "grad_norm": 0.004931934643536806, "learning_rate": 1e-06, "loss": -0.0001, "step": 1390 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2629953771829605, "epoch": 3.6605263157894736, "grad_norm": 0.0034470842219889164, "learning_rate": 1e-06, "loss": 0.0008, "step": 1391 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.26158788800239563, "epoch": 3.663157894736842, "grad_norm": 0.0022328526247292757, "learning_rate": 1e-06, "loss": -0.0014, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 286.408203125, "completions/mean_terminated_length": 286.408203125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.2523958906531334, "epoch": 3.6657894736842103, "frac_reward_zero_std": 0.84375, "grad_norm": 0.005374122876673937, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 279998006.0, "reward": 0.8294917345046997, "reward_std": 0.03202075511217117, "rewards/progression_diversity/mean": -4.53010288765654e-05, "rewards/progression_diversity/std": 0.0010250452905893326, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9602864384651184, "rewards/symbolic_reward_partial_score/std": 0.13371798396110535, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0621941089630127, "sampling/importance_sampling_ratio/min": 0.00307788816280663, "sampling/sampling_logp_difference/max": 5.783511638641357, "sampling/sampling_logp_difference/mean": 0.12583598494529724, "step": 1393 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2517199218273163, "epoch": 3.668421052631579, "grad_norm": 0.0023138711694628, "learning_rate": 1e-06, "loss": -0.0001, "step": 1394 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2528526932001114, "epoch": 3.6710526315789473, "grad_norm": 0.003128192387521267, "learning_rate": 1e-06, "loss": -0.0007, "step": 1395 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.25596775114536285, "epoch": 3.6736842105263157, "grad_norm": 0.0009576270822435617, "learning_rate": 1e-06, "loss": -0.0009, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 293.642578125, "completions/mean_terminated_length": 293.642578125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.25471819937229156, "epoch": 3.6763157894736844, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0031971256248652935, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 280548831.0, "reward": 0.8655747175216675, "reward_std": 0.0322401225566864, "rewards/progression_diversity/mean": -0.0001476671895943582, "rewards/progression_diversity/std": 0.0032431327272206545, "rewards/symbolic_reward_accuracy/mean": 0.951171875, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.98291015625, "rewards/symbolic_reward_partial_score/std": 0.0817250907421112, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062921404838562, "sampling/importance_sampling_ratio/min": 0.0002203639887738973, "sampling/sampling_logp_difference/max": 8.4202299118042, "sampling/sampling_logp_difference/mean": 0.1252584308385849, "step": 1397 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2561066001653671, "epoch": 3.6789473684210527, "grad_norm": 0.004226773511618376, "learning_rate": 1e-06, "loss": 0.0026, "step": 1398 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.25608038902282715, "epoch": 3.681578947368421, "grad_norm": 0.0008029827149584889, "learning_rate": 1e-06, "loss": 0.0, "step": 1399 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.24976499378681183, "epoch": 3.6842105263157894, "grad_norm": 0.0008318249019794166, "learning_rate": 1e-06, "loss": -0.0004, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 291.421875, "completions/mean_terminated_length": 291.421875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "entropy": 0.2544937878847122, "epoch": 3.6868421052631577, "frac_reward_zero_std": 0.8125, "grad_norm": 0.00313924765214324, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 281088311.0, "reward": 0.8208984732627869, "reward_std": 0.05074112489819527, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.955078125, "rewards/symbolic_reward_partial_score/std": 0.14979958534240723, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0617702007293701, "sampling/importance_sampling_ratio/min": 1.9943448933190666e-05, "sampling/sampling_logp_difference/max": 10.822609901428223, "sampling/sampling_logp_difference/mean": 0.12716618180274963, "step": 1401 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2559308558702469, "epoch": 3.6894736842105265, "grad_norm": 0.001412192708812654, "learning_rate": 1e-06, "loss": 0.0001, "step": 1402 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.25313878059387207, "epoch": 3.692105263157895, "grad_norm": 0.0063199615105986595, "learning_rate": 1e-06, "loss": -0.0, "step": 1403 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2549208104610443, "epoch": 3.694736842105263, "grad_norm": 0.004130657762289047, "learning_rate": 1e-06, "loss": -0.0003, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 291.791015625, "completions/mean_terminated_length": 291.791015625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.25114137679338455, "epoch": 3.6973684210526314, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0008531836792826653, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 281645004.0, "reward": 0.8064441084861755, "reward_std": 0.011723745614290237, "rewards/progression_diversity/mean": -0.00012486957712098956, "rewards/progression_diversity/std": 0.002825475763529539, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9459635615348816, "rewards/symbolic_reward_partial_score/std": 0.14375825226306915, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619075298309326, "sampling/importance_sampling_ratio/min": 0.00024697749176993966, "sampling/sampling_logp_difference/max": 8.30621337890625, "sampling/sampling_logp_difference/mean": 0.12634673714637756, "step": 1405 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.25604428350925446, "epoch": 3.7, "grad_norm": 0.00038085339474491775, "learning_rate": 1e-06, "loss": 0.0004, "step": 1406 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.25144171714782715, "epoch": 3.7026315789473685, "grad_norm": 0.0005420586676336825, "learning_rate": 1e-06, "loss": 0.0008, "step": 1407 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.25223299860954285, "epoch": 3.705263157894737, "grad_norm": 0.0004907060065306723, "learning_rate": 1e-06, "loss": -0.0004, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 291.05078125, "completions/mean_terminated_length": 291.05078125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.24966774135828018, "epoch": 3.707894736842105, "frac_reward_zero_std": 0.78125, "grad_norm": 0.002532323356717825, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 282182150.0, "reward": 0.8462873697280884, "reward_std": 0.05212379992008209, "rewards/progression_diversity/mean": -0.00016891930135898292, "rewards/progression_diversity/std": 0.0038222072180360556, "rewards/symbolic_reward_accuracy/mean": 0.923828125, "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, "rewards/symbolic_reward_partial_score/mean": 0.9733072519302368, "rewards/symbolic_reward_partial_score/std": 0.10344868153333664, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06183922290802, "sampling/importance_sampling_ratio/min": 0.0015853223158046603, "sampling/sampling_logp_difference/max": 6.446967601776123, "sampling/sampling_logp_difference/mean": 0.12602993845939636, "step": 1409 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2533850073814392, "epoch": 3.7105263157894735, "grad_norm": 0.00075558852404356, "learning_rate": 1e-06, "loss": -0.0002, "step": 1410 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.25219717621803284, "epoch": 3.713157894736842, "grad_norm": 0.004231073893606663, "learning_rate": 1e-06, "loss": 0.0011, "step": 1411 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.25498561561107635, "epoch": 3.7157894736842105, "grad_norm": 0.0037051881663501263, "learning_rate": 1e-06, "loss": -0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 289.005859375, "completions/mean_terminated_length": 289.005859375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.24799177050590515, "epoch": 3.718421052631579, "frac_reward_zero_std": 0.71875, "grad_norm": 0.003672688500955701, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 282737257.0, "reward": 0.7589839696884155, "reward_std": 0.0520077683031559, "rewards/progression_diversity/mean": -3.9446502341888845e-05, "rewards/progression_diversity/std": 0.0007376011344604194, "rewards/symbolic_reward_accuracy/mean": 0.796875, "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, "rewards/symbolic_reward_partial_score/mean": 0.9361979365348816, "rewards/symbolic_reward_partial_score/std": 0.14094945788383484, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0618805885314941, "sampling/importance_sampling_ratio/min": 0.00025599630316719413, "sampling/sampling_logp_difference/max": 8.270347595214844, "sampling/sampling_logp_difference/mean": 0.12418229877948761, "step": 1413 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.2508721351623535, "epoch": 3.7210526315789476, "grad_norm": 0.0015133292181417346, "learning_rate": 1e-06, "loss": -0.0006, "step": 1414 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2473483458161354, "epoch": 3.723684210526316, "grad_norm": 0.003211608622223139, "learning_rate": 1e-06, "loss": 0.0002, "step": 1415 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.2483832836151123, "epoch": 3.7263157894736842, "grad_norm": 0.001318311900831759, "learning_rate": 1e-06, "loss": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 290.224609375, "completions/mean_terminated_length": 290.224609375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.2575960010290146, "epoch": 3.7289473684210526, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0016101644141599536, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 283283164.0, "reward": 0.8690425753593445, "reward_std": 0.04031773656606674, "rewards/progression_diversity/mean": -4.095973417861387e-05, "rewards/progression_diversity/std": 0.0009268129942938685, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9788411855697632, "rewards/symbolic_reward_partial_score/std": 0.10289039462804794, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061889886856079, "sampling/importance_sampling_ratio/min": 6.194894922373351e-06, "sampling/sampling_logp_difference/max": 11.991785049438477, "sampling/sampling_logp_difference/mean": 0.12483149766921997, "step": 1417 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2505711019039154, "epoch": 3.731578947368421, "grad_norm": 0.0009705891134217381, "learning_rate": 1e-06, "loss": 0.0001, "step": 1418 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.25299596786499023, "epoch": 3.734210526315789, "grad_norm": 0.004146168474107981, "learning_rate": 1e-06, "loss": -0.0, "step": 1419 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.25464165210723877, "epoch": 3.736842105263158, "grad_norm": 0.0012070556404069066, "learning_rate": 1e-06, "loss": 0.0007, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 294.47265625, "completions/mean_terminated_length": 294.47265625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "entropy": 0.2556218057870865, "epoch": 3.7394736842105263, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0047198995016515255, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 283829902.0, "reward": 0.7914550304412842, "reward_std": 0.037014544010162354, "rewards/progression_diversity/mean": -1.3498356565833092e-05, "rewards/progression_diversity/std": 0.00030543291359208524, "rewards/symbolic_reward_accuracy/mean": 0.853515625, "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, "rewards/symbolic_reward_partial_score/mean": 0.93115234375, "rewards/symbolic_reward_partial_score/std": 0.17769792675971985, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062889814376831, "sampling/importance_sampling_ratio/min": 0.00024652990396134555, "sampling/sampling_logp_difference/max": 8.308027267456055, "sampling/sampling_logp_difference/mean": 0.12772433459758759, "step": 1421 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2585349977016449, "epoch": 3.7421052631578946, "grad_norm": 0.001490709139034152, "learning_rate": 1e-06, "loss": 0.0009, "step": 1422 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2560439109802246, "epoch": 3.7447368421052634, "grad_norm": 0.0030670189298689365, "learning_rate": 1e-06, "loss": -0.0009, "step": 1423 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2598079591989517, "epoch": 3.7473684210526317, "grad_norm": 0.006503027398139238, "learning_rate": 1e-06, "loss": 0.0012, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 316.50390625, "completions/mean_terminated_length": 285.0606689453125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.24654557555913925, "epoch": 3.75, "frac_reward_zero_std": 0.875, "grad_norm": 0.0053245374001562595, "learning_rate": 1e-06, "loss": 0.0145, "num_tokens": 284389744.0, "reward": 0.8325069546699524, "reward_std": 0.039236586540937424, "rewards/progression_diversity/mean": -0.0012605031952261925, "rewards/progression_diversity/std": 0.028521930798888206, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9749348759651184, "rewards/symbolic_reward_partial_score/std": 0.09453998506069183, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0596258640289307, "sampling/importance_sampling_ratio/min": 0.0023435140028595924, "sampling/sampling_logp_difference/max": 6.056103706359863, "sampling/sampling_logp_difference/mean": 0.12060326337814331, "step": 1425 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.25124719738960266, "epoch": 3.7526315789473683, "grad_norm": 0.0019301900174468756, "learning_rate": 1e-06, "loss": -0.0, "step": 1426 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2514963522553444, "epoch": 3.7552631578947366, "grad_norm": 0.0022078719921410084, "learning_rate": 1e-06, "loss": 0.0003, "step": 1427 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2516591548919678, "epoch": 3.7578947368421054, "grad_norm": 0.0018776309443637729, "learning_rate": 1e-06, "loss": -0.0002, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 288.1328125, "completions/mean_terminated_length": 288.1328125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.2531207203865051, "epoch": 3.7605263157894737, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0015241794753819704, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 284912020.0, "reward": 0.852148175239563, "reward_std": 0.033069413155317307, "rewards/progression_diversity/mean": -2.7848042009281926e-05, "rewards/progression_diversity/std": 0.0006301292451098561, "rewards/symbolic_reward_accuracy/mean": 0.93359375, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.9733073115348816, "rewards/symbolic_reward_partial_score/std": 0.10501327365636826, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060740351676941, "sampling/importance_sampling_ratio/min": 1.1205984264961444e-06, "sampling/sampling_logp_difference/max": 13.701647758483887, "sampling/sampling_logp_difference/mean": 0.1257631629705429, "step": 1429 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2528574913740158, "epoch": 3.763157894736842, "grad_norm": 0.0046994988806545734, "learning_rate": 1e-06, "loss": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2516646832227707, "epoch": 3.765789473684211, "grad_norm": 0.0015605302760377526, "learning_rate": 1e-06, "loss": 0.0009, "step": 1431 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.25141212344169617, "epoch": 3.768421052631579, "grad_norm": 0.0009051641100086272, "learning_rate": 1e-06, "loss": -0.0004, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 286.068359375, "completions/mean_terminated_length": 286.068359375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.24804198741912842, "epoch": 3.7710526315789474, "frac_reward_zero_std": 0.6875, "grad_norm": 0.004949395544826984, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 285463479.0, "reward": 0.7871561646461487, "reward_std": 0.07125456631183624, "rewards/progression_diversity/mean": -0.0002128158521372825, "rewards/progression_diversity/std": 0.004131709225475788, "rewards/symbolic_reward_accuracy/mean": 0.833984375, "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, "rewards/symbolic_reward_partial_score/mean": 0.9558919668197632, "rewards/symbolic_reward_partial_score/std": 0.1025310531258583, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060426950454712, "sampling/importance_sampling_ratio/min": 0.000660174002405256, "sampling/sampling_logp_difference/max": 7.323007106781006, "sampling/sampling_logp_difference/mean": 0.12435492873191833, "step": 1433 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0859375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.24799808859825134, "epoch": 3.7736842105263158, "grad_norm": 0.011806968599557877, "learning_rate": 1e-06, "loss": -0.0009, "step": 1434 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.24940943717956543, "epoch": 3.776315789473684, "grad_norm": 0.003585018450394273, "learning_rate": 1e-06, "loss": 0.0013, "step": 1435 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.24540776759386063, "epoch": 3.7789473684210524, "grad_norm": 0.006892574485391378, "learning_rate": 1e-06, "loss": -0.0001, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 284.197265625, "completions/mean_terminated_length": 284.197265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.2505408376455307, "epoch": 3.781578947368421, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0029755232390016317, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 286036476.0, "reward": 0.7453613877296448, "reward_std": 0.03177817165851593, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.78125, "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, "rewards/symbolic_reward_partial_score/mean": 0.9220377206802368, "rewards/symbolic_reward_partial_score/std": 0.16653913259506226, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0615108013153076, "sampling/importance_sampling_ratio/min": 0.0015313836047425866, "sampling/sampling_logp_difference/max": 6.481583595275879, "sampling/sampling_logp_difference/mean": 0.12580080330371857, "step": 1437 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.24840565025806427, "epoch": 3.7842105263157895, "grad_norm": 0.001804483006708324, "learning_rate": 1e-06, "loss": -0.0007, "step": 1438 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.25283022224903107, "epoch": 3.786842105263158, "grad_norm": 0.002521629212424159, "learning_rate": 1e-06, "loss": 0.0005, "step": 1439 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2502717077732086, "epoch": 3.7894736842105265, "grad_norm": 0.0006864585448056459, "learning_rate": 1e-06, "loss": -0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 281.96484375, "completions/mean_terminated_length": 281.96484375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2550983428955078, "epoch": 3.792105263157895, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0033912304788827896, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 286587978.0, "reward": 0.7921856641769409, "reward_std": 0.04257185012102127, "rewards/progression_diversity/mean": -0.00018314375483896583, "rewards/progression_diversity/std": 0.0032831577118486166, "rewards/symbolic_reward_accuracy/mean": 0.84375, "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, "rewards/symbolic_reward_partial_score/mean": 0.953125, "rewards/symbolic_reward_partial_score/std": 0.11922687292098999, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061973214149475, "sampling/importance_sampling_ratio/min": 0.0007712719962000847, "sampling/sampling_logp_difference/max": 7.167469501495361, "sampling/sampling_logp_difference/mean": 0.12495287507772446, "step": 1441 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2525986135005951, "epoch": 3.794736842105263, "grad_norm": 0.0032344043720513582, "learning_rate": 1e-06, "loss": 0.0003, "step": 1442 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2523139417171478, "epoch": 3.7973684210526315, "grad_norm": 0.003199146594852209, "learning_rate": 1e-06, "loss": -0.0005, "step": 1443 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.24989847838878632, "epoch": 3.8, "grad_norm": 0.0028786209877580404, "learning_rate": 1e-06, "loss": 0.0005, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 281.888671875, "completions/mean_terminated_length": 281.888671875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.24845003336668015, "epoch": 3.8026315789473686, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0037182525265961885, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 287115921.0, "reward": 0.8478025197982788, "reward_std": 0.02462989278137684, "rewards/progression_diversity/mean": -2.4990213205455802e-05, "rewards/progression_diversity/std": 0.0005654640262946486, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9744466543197632, "rewards/symbolic_reward_partial_score/std": 0.09773314744234085, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0622966289520264, "sampling/importance_sampling_ratio/min": 0.0008701475453563035, "sampling/sampling_logp_difference/max": 7.046847820281982, "sampling/sampling_logp_difference/mean": 0.12369303405284882, "step": 1445 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2537457197904587, "epoch": 3.805263157894737, "grad_norm": 0.000904034823179245, "learning_rate": 1e-06, "loss": -0.0002, "step": 1446 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.24874110519886017, "epoch": 3.807894736842105, "grad_norm": 0.0007011942798271775, "learning_rate": 1e-06, "loss": -0.0003, "step": 1447 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24791067093610764, "epoch": 3.8105263157894735, "grad_norm": 0.0011079704854637384, "learning_rate": 1e-06, "loss": 0.0002, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 308.1796875, "completions/mean_terminated_length": 276.72015380859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.24869465827941895, "epoch": 3.8131578947368423, "frac_reward_zero_std": 0.78125, "grad_norm": 0.006853095255792141, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 287657805.0, "reward": 0.8094131350517273, "reward_std": 0.04773065820336342, "rewards/progression_diversity/mean": -0.001070742728188634, "rewards/progression_diversity/std": 0.024228140711784363, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.9441732168197632, "rewards/symbolic_reward_partial_score/std": 0.15558461844921112, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0597963333129883, "sampling/importance_sampling_ratio/min": 0.003406539326533675, "sampling/sampling_logp_difference/max": 5.682058334350586, "sampling/sampling_logp_difference/mean": 0.11979185044765472, "step": 1449 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2493317946791649, "epoch": 3.8157894736842106, "grad_norm": 0.0018536460120230913, "learning_rate": 1e-06, "loss": -0.0006, "step": 1450 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24266890436410904, "epoch": 3.818421052631579, "grad_norm": 0.00839342549443245, "learning_rate": 1e-06, "loss": -0.0005, "step": 1451 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.24180525541305542, "epoch": 3.8210526315789473, "grad_norm": 0.0011780494824051857, "learning_rate": 1e-06, "loss": 0.0278, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 283.44140625, "completions/mean_terminated_length": 283.44140625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2568906396627426, "epoch": 3.8236842105263156, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0014739191392436624, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 288214255.0, "reward": 0.8007813096046448, "reward_std": 0.04135853052139282, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.9583333134651184, "rewards/symbolic_reward_partial_score/std": 0.11808153241872787, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0631709098815918, "sampling/importance_sampling_ratio/min": 0.0010668974136933684, "sampling/sampling_logp_difference/max": 6.843000411987305, "sampling/sampling_logp_difference/mean": 0.1276545524597168, "step": 1453 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.25487686693668365, "epoch": 3.8263157894736843, "grad_norm": 0.007138120010495186, "learning_rate": 1e-06, "loss": 0.0005, "step": 1454 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2546128034591675, "epoch": 3.8289473684210527, "grad_norm": 0.001334945554845035, "learning_rate": 1e-06, "loss": 0.0, "step": 1455 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2549200505018234, "epoch": 3.831578947368421, "grad_norm": 0.0048521822318434715, "learning_rate": 1e-06, "loss": 0.0009, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 277.6171875, "completions/mean_terminated_length": 277.6171875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2548879086971283, "epoch": 3.8342105263157897, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0043244436383247375, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 288753867.0, "reward": 0.76708984375, "reward_std": 0.04385879635810852, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8125, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.9319661259651184, "rewards/symbolic_reward_partial_score/std": 0.15669545531272888, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062173843383789, "sampling/importance_sampling_ratio/min": 0.004508232232183218, "sampling/sampling_logp_difference/max": 5.40185022354126, "sampling/sampling_logp_difference/mean": 0.12663015723228455, "step": 1457 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.24473460018634796, "epoch": 3.836842105263158, "grad_norm": 0.001730249379761517, "learning_rate": 1e-06, "loss": -0.0007, "step": 1458 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2515050768852234, "epoch": 3.8394736842105264, "grad_norm": 0.002126506296917796, "learning_rate": 1e-06, "loss": 0.0006, "step": 1459 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2557496428489685, "epoch": 3.8421052631578947, "grad_norm": 0.004039853345602751, "learning_rate": 1e-06, "loss": -0.0001, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 278.25390625, "completions/mean_terminated_length": 278.25390625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.2558761090040207, "epoch": 3.844736842105263, "frac_reward_zero_std": 0.75, "grad_norm": 0.003505394095554948, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 289297133.0, "reward": 0.8289056420326233, "reward_std": 0.05454582720994949, "rewards/progression_diversity/mean": -6.592483259737492e-05, "rewards/progression_diversity/std": 0.0011662401957437396, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9622395634651184, "rewards/symbolic_reward_partial_score/std": 0.12198054045438766, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0614289045333862, "sampling/importance_sampling_ratio/min": 0.0010322880698367953, "sampling/sampling_logp_difference/max": 6.875977516174316, "sampling/sampling_logp_difference/mean": 0.1257316619157791, "step": 1461 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2523561343550682, "epoch": 3.8473684210526313, "grad_norm": 0.006407232955098152, "learning_rate": 1e-06, "loss": -0.001, "step": 1462 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.24898836761713028, "epoch": 3.85, "grad_norm": 0.006468737497925758, "learning_rate": 1e-06, "loss": 0.0016, "step": 1463 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.25089330971241, "epoch": 3.8526315789473684, "grad_norm": 0.002999432384967804, "learning_rate": 1e-06, "loss": -0.0005, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 276.68359375, "completions/mean_terminated_length": 276.68359375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2475588619709015, "epoch": 3.8552631578947367, "frac_reward_zero_std": 0.78125, "grad_norm": 0.004880095832049847, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 289843947.0, "reward": 0.8241690397262573, "reward_std": 0.03976859897375107, "rewards/progression_diversity/mean": -8.7072177848313e-05, "rewards/progression_diversity/std": 0.001970218727365136, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.9659830331802368, "rewards/symbolic_reward_partial_score/std": 0.11252343654632568, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0622788667678833, "sampling/importance_sampling_ratio/min": 0.00028998416382819414, "sampling/sampling_logp_difference/max": 8.145684242248535, "sampling/sampling_logp_difference/mean": 0.126849964261055, "step": 1465 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.140625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.2497614100575447, "epoch": 3.8578947368421055, "grad_norm": 0.0029560329858213663, "learning_rate": 1e-06, "loss": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2516980916261673, "epoch": 3.860526315789474, "grad_norm": 0.004405204672366381, "learning_rate": 1e-06, "loss": -0.0006, "step": 1467 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.25012845546007156, "epoch": 3.863157894736842, "grad_norm": 0.0012873845407739282, "learning_rate": 1e-06, "loss": 0.0005, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 278.521484375, "completions/mean_terminated_length": 278.521484375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.2574310898780823, "epoch": 3.8657894736842104, "frac_reward_zero_std": 0.65625, "grad_norm": 0.008448783308267593, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 290384342.0, "reward": 0.82958984375, "reward_std": 0.09687106311321259, "rewards/progression_diversity/mean": -4.6260124690888915e-06, "rewards/progression_diversity/std": 0.00010467470565345138, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9606119394302368, "rewards/symbolic_reward_partial_score/std": 0.13187071681022644, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0626296997070312, "sampling/importance_sampling_ratio/min": 0.0021899458952248096, "sampling/sampling_logp_difference/max": 6.123878479003906, "sampling/sampling_logp_difference/mean": 0.1256691813468933, "step": 1469 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1171875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.24842996895313263, "epoch": 3.8684210526315788, "grad_norm": 0.0027846877928823233, "learning_rate": 1e-06, "loss": -0.0006, "step": 1470 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.24932969361543655, "epoch": 3.8710526315789475, "grad_norm": 0.004896063357591629, "learning_rate": 1e-06, "loss": 0.0004, "step": 1471 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.24926616251468658, "epoch": 3.873684210526316, "grad_norm": 0.0034752595238387585, "learning_rate": 1e-06, "loss": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 278.984375, "completions/mean_terminated_length": 278.984375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.24850311875343323, "epoch": 3.876315789473684, "frac_reward_zero_std": 0.71875, "grad_norm": 0.004292353987693787, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 290960366.0, "reward": 0.7541015148162842, "reward_std": 0.06326388567686081, "rewards/progression_diversity/mean": -1.2574851098179352e-05, "rewards/progression_diversity/std": 0.0002845363924279809, "rewards/symbolic_reward_accuracy/mean": 0.796875, "rewards/symbolic_reward_accuracy/std": 0.4027182459831238, "rewards/symbolic_reward_partial_score/mean": 0.919921875, "rewards/symbolic_reward_partial_score/std": 0.1701854020357132, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0626215934753418, "sampling/importance_sampling_ratio/min": 0.00040142651414498687, "sampling/sampling_logp_difference/max": 7.820486068725586, "sampling/sampling_logp_difference/mean": 0.12611128389835358, "step": 1473 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.24769160896539688, "epoch": 3.8789473684210525, "grad_norm": 0.0028536503668874502, "learning_rate": 1e-06, "loss": 0.0006, "step": 1474 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.2475282847881317, "epoch": 3.8815789473684212, "grad_norm": 0.005000830627977848, "learning_rate": 1e-06, "loss": 0.0005, "step": 1475 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.25134460628032684, "epoch": 3.8842105263157896, "grad_norm": 0.0037172201555222273, "learning_rate": 1e-06, "loss": -0.0004, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 285.150390625, "completions/mean_terminated_length": 285.150390625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.2483498975634575, "epoch": 3.886842105263158, "frac_reward_zero_std": 0.71875, "grad_norm": 0.00587662635371089, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 291525691.0, "reward": 0.7465803027153015, "reward_std": 0.07138986885547638, "rewards/progression_diversity/mean": -0.00017707535880617797, "rewards/progression_diversity/std": 0.004006757866591215, "rewards/symbolic_reward_accuracy/mean": 0.78515625, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.9182943105697632, "rewards/symbolic_reward_partial_score/std": 0.1653488576412201, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0621373653411865, "sampling/importance_sampling_ratio/min": 0.0009335778304375708, "sampling/sampling_logp_difference/max": 6.9764862060546875, "sampling/sampling_logp_difference/mean": 0.12466084212064743, "step": 1477 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1484375, "entropy": 0.2482881024479866, "epoch": 3.889473684210526, "grad_norm": 0.004389946348965168, "learning_rate": 1e-06, "loss": -0.0011, "step": 1478 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.171875, "entropy": 0.24733516573905945, "epoch": 3.8921052631578945, "grad_norm": 0.002358887577429414, "learning_rate": 1e-06, "loss": -0.0002, "step": 1479 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.24862007796764374, "epoch": 3.8947368421052633, "grad_norm": 0.007056436035782099, "learning_rate": 1e-06, "loss": 0.0009, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 312.63671875, "completions/mean_terminated_length": 281.1859130859375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "entropy": 0.24648213386535645, "epoch": 3.8973684210526316, "frac_reward_zero_std": 0.71875, "grad_norm": 0.006561717949807644, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 292103585.0, "reward": 0.8242520689964294, "reward_std": 0.05909993499517441, "rewards/progression_diversity/mean": -0.0015558208106085658, "rewards/progression_diversity/std": 0.0352042093873024, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.9630534052848816, "rewards/symbolic_reward_partial_score/std": 0.12470652163028717, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0595953464508057, "sampling/importance_sampling_ratio/min": 0.0015703764511272311, "sampling/sampling_logp_difference/max": 6.456439971923828, "sampling/sampling_logp_difference/mean": 0.12068993598222733, "step": 1481 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1640625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1875, "entropy": 0.24098284542560577, "epoch": 3.9, "grad_norm": 0.0017995714442804456, "learning_rate": 1e-06, "loss": 0.0108, "step": 1482 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2443038821220398, "epoch": 3.9026315789473687, "grad_norm": 0.004129770677536726, "learning_rate": 1e-06, "loss": 0.0014, "step": 1483 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2413162887096405, "epoch": 3.905263157894737, "grad_norm": 0.00310018309392035, "learning_rate": 1e-06, "loss": -0.0014, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 280.509765625, "completions/mean_terminated_length": 280.509765625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.2457173392176628, "epoch": 3.9078947368421053, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0024882822763174772, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 292652358.0, "reward": 0.8500970005989075, "reward_std": 0.050697989761829376, "rewards/progression_diversity/mean": -7.189810276031494e-05, "rewards/progression_diversity/std": 0.0016268682666122913, "rewards/symbolic_reward_accuracy/mean": 0.927734375, "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, "rewards/symbolic_reward_partial_score/mean": 0.9781900644302368, "rewards/symbolic_reward_partial_score/std": 0.08413910120725632, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0623741149902344, "sampling/importance_sampling_ratio/min": 0.004645559936761856, "sampling/sampling_logp_difference/max": 5.371843338012695, "sampling/sampling_logp_difference/mean": 0.12582653760910034, "step": 1485 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.25171560794115067, "epoch": 3.9105263157894736, "grad_norm": 0.0059862262569367886, "learning_rate": 1e-06, "loss": 0.0007, "step": 1486 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.24698235094547272, "epoch": 3.913157894736842, "grad_norm": 0.0026288467925041914, "learning_rate": 1e-06, "loss": -0.0003, "step": 1487 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2489272803068161, "epoch": 3.9157894736842103, "grad_norm": 0.0037700568791478872, "learning_rate": 1e-06, "loss": -0.0002, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 280.005859375, "completions/mean_terminated_length": 280.005859375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "entropy": 0.24408536404371262, "epoch": 3.918421052631579, "frac_reward_zero_std": 0.84375, "grad_norm": 0.004705451894551516, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 293172681.0, "reward": 0.790283203125, "reward_std": 0.05368012189865112, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.93896484375, "rewards/symbolic_reward_partial_score/std": 0.15343523025512695, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061496376991272, "sampling/importance_sampling_ratio/min": 0.0012954623671248555, "sampling/sampling_logp_difference/max": 6.648887634277344, "sampling/sampling_logp_difference/mean": 0.12389793992042542, "step": 1489 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24940645694732666, "epoch": 3.9210526315789473, "grad_norm": 0.0035800114274024963, "learning_rate": 1e-06, "loss": -0.0008, "step": 1490 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24737446755170822, "epoch": 3.9236842105263157, "grad_norm": 0.0017300564795732498, "learning_rate": 1e-06, "loss": 0.0003, "step": 1491 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.24045811593532562, "epoch": 3.9263157894736844, "grad_norm": 0.002610960975289345, "learning_rate": 1e-06, "loss": 0.0006, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 346.404296875, "completions/mean_terminated_length": 283.51177978515625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.24617382884025574, "epoch": 3.9289473684210527, "frac_reward_zero_std": 0.53125, "grad_norm": 0.007368352729827166, "learning_rate": 1e-06, "loss": -0.0017, "num_tokens": 293738008.0, "reward": 0.8140473961830139, "reward_std": 0.11378496885299683, "rewards/progression_diversity/mean": -0.0015145066427066922, "rewards/progression_diversity/std": 0.03268362581729889, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9407552480697632, "rewards/symbolic_reward_partial_score/std": 0.17397615313529968, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058638572692871, "sampling/importance_sampling_ratio/min": 0.0024584669154137373, "sampling/sampling_logp_difference/max": 6.0082173347473145, "sampling/sampling_logp_difference/mean": 0.1195145696401596, "step": 1493 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.2411375194787979, "epoch": 3.931578947368421, "grad_norm": 0.008139763958752155, "learning_rate": 1e-06, "loss": 0.0079, "step": 1494 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.171875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.234375, "entropy": 0.24579878151416779, "epoch": 3.9342105263157894, "grad_norm": 0.006455949507653713, "learning_rate": 1e-06, "loss": 0.0012, "step": 1495 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.15625, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2265625, "entropy": 0.24035992473363876, "epoch": 3.9368421052631577, "grad_norm": 0.006683530285954475, "learning_rate": 1e-06, "loss": 0.0233, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 278.2734375, "completions/mean_terminated_length": 278.2734375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.2454933151602745, "epoch": 3.9394736842105265, "frac_reward_zero_std": 0.875, "grad_norm": 0.0044021462090313435, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 294279620.0, "reward": 0.809814453125, "reward_std": 0.03176013380289078, "rewards/progression_diversity/mean": -2.186113533753087e-06, "rewards/progression_diversity/std": 4.94660998811014e-05, "rewards/symbolic_reward_accuracy/mean": 0.8671875, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.9650065302848816, "rewards/symbolic_reward_partial_score/std": 0.09345835447311401, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061531662940979, "sampling/importance_sampling_ratio/min": 0.0018202782375738025, "sampling/sampling_logp_difference/max": 6.308765888214111, "sampling/sampling_logp_difference/mean": 0.1247473657131195, "step": 1497 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2489335760474205, "epoch": 3.942105263157895, "grad_norm": 0.0007171111647039652, "learning_rate": 1e-06, "loss": -0.0006, "step": 1498 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24430719763040543, "epoch": 3.944736842105263, "grad_norm": 0.0038494495674967766, "learning_rate": 1e-06, "loss": -0.0001, "step": 1499 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.24642930924892426, "epoch": 3.9473684210526314, "grad_norm": 0.0003605918900575489, "learning_rate": 1e-06, "loss": 0.0003, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 278.025390625, "completions/mean_terminated_length": 278.025390625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.24727856367826462, "epoch": 3.95, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0015909909270703793, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 294838289.0, "reward": 0.866308331489563, "reward_std": 0.029097389429807663, "rewards/progression_diversity/mean": -2.9835084205842577e-05, "rewards/progression_diversity/std": 0.000618708087131381, "rewards/symbolic_reward_accuracy/mean": 0.953125, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.9814453125, "rewards/symbolic_reward_partial_score/std": 0.08775121718645096, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0610827207565308, "sampling/importance_sampling_ratio/min": 8.675010030856356e-05, "sampling/sampling_logp_difference/max": 9.352478981018066, "sampling/sampling_logp_difference/mean": 0.12416176497936249, "step": 1501 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24381491541862488, "epoch": 3.9526315789473685, "grad_norm": 0.0022116934414952993, "learning_rate": 1e-06, "loss": -0.0011, "step": 1502 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.23999455571174622, "epoch": 3.955263157894737, "grad_norm": 0.0011692576808854938, "learning_rate": 1e-06, "loss": 0.0001, "step": 1503 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2409554198384285, "epoch": 3.957894736842105, "grad_norm": 0.0009114885469898582, "learning_rate": 1e-06, "loss": 0.0006, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 277.619140625, "completions/mean_terminated_length": 277.619140625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.24572695791721344, "epoch": 3.9605263157894735, "frac_reward_zero_std": 0.8125, "grad_norm": 0.00424238620325923, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 295397230.0, "reward": 0.8301756381988525, "reward_std": 0.032026953995227814, "rewards/progression_diversity/mean": -1.1566195098566823e-05, "rewards/progression_diversity/std": 0.00026171313947997987, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.9742838144302368, "rewards/symbolic_reward_partial_score/std": 0.08367852121591568, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0614919662475586, "sampling/importance_sampling_ratio/min": 0.0001828877575462684, "sampling/sampling_logp_difference/max": 8.606637954711914, "sampling/sampling_logp_difference/mean": 0.12657146155834198, "step": 1505 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2439851313829422, "epoch": 3.963157894736842, "grad_norm": 0.004110608249902725, "learning_rate": 1e-06, "loss": 0.0011, "step": 1506 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24492160975933075, "epoch": 3.9657894736842105, "grad_norm": 0.0007968592108227313, "learning_rate": 1e-06, "loss": -0.0006, "step": 1507 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2445434406399727, "epoch": 3.968421052631579, "grad_norm": 0.0011729674879461527, "learning_rate": 1e-06, "loss": -0.0002, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 313.478515625, "completions/mean_terminated_length": 282.02935791015625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.24183562397956848, "epoch": 3.9710526315789476, "frac_reward_zero_std": 0.75, "grad_norm": 0.009790533222258091, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 295987907.0, "reward": 0.7889963388442993, "reward_std": 0.03885188698768616, "rewards/progression_diversity/mean": -0.0017403773963451385, "rewards/progression_diversity/std": 0.0372454933822155, "rewards/symbolic_reward_accuracy/mean": 0.841796875, "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, "rewards/symbolic_reward_partial_score/mean": 0.9471028447151184, "rewards/symbolic_reward_partial_score/std": 0.13628406822681427, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0608797073364258, "sampling/importance_sampling_ratio/min": 0.0006632054573856294, "sampling/sampling_logp_difference/max": 7.31842565536499, "sampling/sampling_logp_difference/mean": 0.12243205308914185, "step": 1509 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.15625, "entropy": 0.24036569148302078, "epoch": 3.973684210526316, "grad_norm": 0.0021023740991950035, "learning_rate": 1e-06, "loss": 0.0112, "step": 1510 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24389879405498505, "epoch": 3.9763157894736842, "grad_norm": 0.003945766016840935, "learning_rate": 1e-06, "loss": -0.0011, "step": 1511 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.24920684099197388, "epoch": 3.9789473684210526, "grad_norm": 0.0016033814754337072, "learning_rate": 1e-06, "loss": -0.0001, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 280.58203125, "completions/mean_terminated_length": 280.58203125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.24790653586387634, "epoch": 3.981578947368421, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0030526912305504084, "learning_rate": 1e-06, "loss": -0.001, "num_tokens": 296541709.0, "reward": 0.8328123092651367, "reward_std": 0.06292681396007538, "rewards/progression_diversity/mean": -1.7364176528644748e-05, "rewards/progression_diversity/std": 0.000392906425986439, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9713541865348816, "rewards/symbolic_reward_partial_score/std": 0.09234537184238434, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0622069835662842, "sampling/importance_sampling_ratio/min": 0.0017897688085213304, "sampling/sampling_logp_difference/max": 6.325668811798096, "sampling/sampling_logp_difference/mean": 0.12474418431520462, "step": 1513 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.2466568797826767, "epoch": 3.984210526315789, "grad_norm": 0.0044132862240076065, "learning_rate": 1e-06, "loss": -0.0004, "step": 1514 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.2421964779496193, "epoch": 3.986842105263158, "grad_norm": 0.003211502218618989, "learning_rate": 1e-06, "loss": 0.0016, "step": 1515 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1484375, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1953125, "entropy": 0.24317516386508942, "epoch": 3.9894736842105263, "grad_norm": 0.0014716348377987742, "learning_rate": 1e-06, "loss": -0.0002, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 281.439453125, "completions/mean_terminated_length": 281.439453125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.24056368321180344, "epoch": 3.9921052631578946, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0032884979154914618, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 297095630.0, "reward": 0.8452636003494263, "reward_std": 0.03771102428436279, "rewards/progression_diversity/mean": -1.1692754924297333e-05, "rewards/progression_diversity/std": 0.00026457683998160064, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9659830331802368, "rewards/symbolic_reward_partial_score/std": 0.1378202587366104, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619913339614868, "sampling/importance_sampling_ratio/min": 0.0008337947074323893, "sampling/sampling_logp_difference/max": 7.0895233154296875, "sampling/sampling_logp_difference/mean": 0.12588346004486084, "step": 1517 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2486308068037033, "epoch": 3.9947368421052634, "grad_norm": 0.0011405708501115441, "learning_rate": 1e-06, "loss": -0.0005, "step": 1518 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24828965216875076, "epoch": 3.9973684210526317, "grad_norm": 0.002011633710935712, "learning_rate": 1e-06, "loss": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2471499964594841, "epoch": 4.0, "grad_norm": 0.0017826375551521778, "learning_rate": 1e-06, "loss": -0.0005, "step": 1520 }, { "epoch": 4.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0009765625, "eval_completions/max_length": 1970.21875, "eval_completions/max_terminated_length": 543.8125, "eval_completions/mean_length": 294.76171875, "eval_completions/mean_terminated_length": 279.04593563079834, "eval_completions/min_length": 161.0625, "eval_completions/min_terminated_length": 161.0625, "eval_entropy": 0.2479579197242856, "eval_frac_reward_zero_std": 0.79296875, "eval_loss": 0.0009922023164108396, "eval_num_tokens": 297095630.0, "eval_reward": 0.8440247587859631, "eval_reward_std": 0.04477399826329709, "eval_rewards/progression_diversity/mean": -0.0006040489212182365, "eval_rewards/progression_diversity/std": 0.005385569964801107, "eval_rewards/symbolic_reward_accuracy/mean": 0.9228515625, "eval_rewards/symbolic_reward_accuracy/std": 0.2023696736432612, "eval_rewards/symbolic_reward_partial_score/mean": 0.9701741561293602, "eval_rewards/symbolic_reward_partial_score/std": 0.08667883859016001, "eval_rewards/tag_count_reward/mean": -0.00732421875, "eval_rewards/tag_count_reward/std": 0.03229685686528683, "eval_runtime": 251.604, "eval_samples_per_second": 0.994, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.061691664159298, "eval_sampling/importance_sampling_ratio/min": 0.004686991564540222, "eval_sampling/sampling_logp_difference/max": 14.263333037495613, "eval_sampling/sampling_logp_difference/mean": 0.12779978360049427, "eval_steps_per_second": 0.008, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 285.51171875, "completions/mean_terminated_length": 285.51171875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "entropy": 0.24478785693645477, "epoch": 4.002631578947368, "frac_reward_zero_std": 0.65625, "grad_norm": 0.007620229385793209, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 297658292.0, "reward": 0.7886228561401367, "reward_std": 0.08386802673339844, "rewards/progression_diversity/mean": -1.9863946363329887e-05, "rewards/progression_diversity/std": 0.00044946977868676186, "rewards/symbolic_reward_accuracy/mean": 0.84765625, "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, "rewards/symbolic_reward_partial_score/mean": 0.9334310293197632, "rewards/symbolic_reward_partial_score/std": 0.16388244926929474, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0625085830688477, "sampling/importance_sampling_ratio/min": 0.0003958561283070594, "sampling/sampling_logp_difference/max": 7.8344597816467285, "sampling/sampling_logp_difference/mean": 0.1256425380706787, "step": 1521 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2409704327583313, "epoch": 4.005263157894737, "grad_norm": 0.0029928318690508604, "learning_rate": 1e-06, "loss": -0.0008, "step": 1522 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1796875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.21875, "entropy": 0.24776104092597961, "epoch": 4.007894736842105, "grad_norm": 0.009631451219320297, "learning_rate": 1e-06, "loss": 0.0002, "step": 1523 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1796875, "entropy": 0.24515005946159363, "epoch": 4.010526315789473, "grad_norm": 0.002499483060091734, "learning_rate": 1e-06, "loss": 0.0012, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 281.181640625, "completions/mean_terminated_length": 281.181640625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.24500173330307007, "epoch": 4.0131578947368425, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0030381688848137856, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 298184369.0, "reward": 0.866503119468689, "reward_std": 0.03898514062166214, "rewards/progression_diversity/mean": -8.544186130166054e-05, "rewards/progression_diversity/std": 0.001358157955110073, "rewards/symbolic_reward_accuracy/mean": 0.953125, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.9820963740348816, "rewards/symbolic_reward_partial_score/std": 0.08295471221208572, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0618839263916016, "sampling/importance_sampling_ratio/min": 0.00013820610183756799, "sampling/sampling_logp_difference/max": 8.886764526367188, "sampling/sampling_logp_difference/mean": 0.12790223956108093, "step": 1525 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2474457398056984, "epoch": 4.015789473684211, "grad_norm": 0.002514457330107689, "learning_rate": 1e-06, "loss": -0.0005, "step": 1526 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24642843008041382, "epoch": 4.018421052631579, "grad_norm": 0.0025110999122262, "learning_rate": 1e-06, "loss": -0.0009, "step": 1527 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2506309896707535, "epoch": 4.021052631578947, "grad_norm": 0.004548352677375078, "learning_rate": 1e-06, "loss": 0.0019, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 284.740234375, "completions/mean_terminated_length": 284.740234375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.2474454790353775, "epoch": 4.023684210526316, "frac_reward_zero_std": 0.84375, "grad_norm": 0.003918915521353483, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 298728972.0, "reward": 0.8486324548721313, "reward_std": 0.028091946616768837, "rewards/progression_diversity/mean": -4.3650590669130906e-05, "rewards/progression_diversity/std": 0.0009877000702545047, "rewards/symbolic_reward_accuracy/mean": 0.921875, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.9850260019302368, "rewards/symbolic_reward_partial_score/std": 0.055599283427000046, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0621516704559326, "sampling/importance_sampling_ratio/min": 0.0001810384273994714, "sampling/sampling_logp_difference/max": 8.616801261901855, "sampling/sampling_logp_difference/mean": 0.12489888072013855, "step": 1529 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.23918018490076065, "epoch": 4.026315789473684, "grad_norm": 0.002114385599270463, "learning_rate": 1e-06, "loss": 0.0011, "step": 1530 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24100320041179657, "epoch": 4.028947368421052, "grad_norm": 0.000925762637052685, "learning_rate": 1e-06, "loss": -0.0009, "step": 1531 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2428521066904068, "epoch": 4.031578947368421, "grad_norm": 0.0010113362222909927, "learning_rate": 1e-06, "loss": -0.0006, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 283.44140625, "completions/mean_terminated_length": 283.44140625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.24821541458368301, "epoch": 4.03421052631579, "frac_reward_zero_std": 0.84375, "grad_norm": 0.003675073618069291, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 299262222.0, "reward": 0.8415029644966125, "reward_std": 0.03057749569416046, "rewards/progression_diversity/mean": -0.00010043365909950808, "rewards/progression_diversity/std": 0.0022725542075932026, "rewards/symbolic_reward_accuracy/mean": 0.916015625, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.9729817509651184, "rewards/symbolic_reward_partial_score/std": 0.10110417753458023, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0609973669052124, "sampling/importance_sampling_ratio/min": 0.0004463788354769349, "sampling/sampling_logp_difference/max": 7.7143425941467285, "sampling/sampling_logp_difference/mean": 0.12613673508167267, "step": 1533 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2426435500383377, "epoch": 4.036842105263158, "grad_norm": 0.0008600465371273458, "learning_rate": 1e-06, "loss": 0.0001, "step": 1534 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24244511872529984, "epoch": 4.0394736842105265, "grad_norm": 0.0012452512746676803, "learning_rate": 1e-06, "loss": -0.0007, "step": 1535 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.24515938013792038, "epoch": 4.042105263157895, "grad_norm": 0.004650202114135027, "learning_rate": 1e-06, "loss": 0.0008, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 282.837890625, "completions/mean_terminated_length": 282.837890625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.2445775270462036, "epoch": 4.044736842105263, "frac_reward_zero_std": 0.875, "grad_norm": 0.000899067847058177, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 299790811.0, "reward": 0.8743164539337158, "reward_std": 0.02148437686264515, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.9925130009651184, "rewards/symbolic_reward_partial_score/std": 0.040022555738687515, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061580777168274, "sampling/importance_sampling_ratio/min": 0.0009474054677411914, "sampling/sampling_logp_difference/max": 6.961783409118652, "sampling/sampling_logp_difference/mean": 0.12686872482299805, "step": 1537 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2427464723587036, "epoch": 4.0473684210526315, "grad_norm": 0.0006039505824446678, "learning_rate": 1e-06, "loss": 0.0007, "step": 1538 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.24190915375947952, "epoch": 4.05, "grad_norm": 0.0006737467483617365, "learning_rate": 1e-06, "loss": -0.0006, "step": 1539 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.24564451724290848, "epoch": 4.052631578947368, "grad_norm": 0.0006470574298873544, "learning_rate": 1e-06, "loss": 0.0007, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 286.171875, "completions/mean_terminated_length": 286.171875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "entropy": 0.24494989961385727, "epoch": 4.0552631578947365, "frac_reward_zero_std": 0.78125, "grad_norm": 0.00690782256424427, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 300320787.0, "reward": 0.8101066946983337, "reward_std": 0.04871155321598053, "rewards/progression_diversity/mean": -7.603035192005336e-05, "rewards/progression_diversity/std": 0.0017203704919666052, "rewards/symbolic_reward_accuracy/mean": 0.87109375, "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, "rewards/symbolic_reward_partial_score/std": 0.11796627193689346, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0622797012329102, "sampling/importance_sampling_ratio/min": 0.00044912920566275716, "sampling/sampling_logp_difference/max": 7.708199977874756, "sampling/sampling_logp_difference/mean": 0.12562555074691772, "step": 1541 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.24104497581720352, "epoch": 4.057894736842106, "grad_norm": 0.0021957457065582275, "learning_rate": 1e-06, "loss": 0.0006, "step": 1542 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.250149168074131, "epoch": 4.060526315789474, "grad_norm": 0.0024236314930021763, "learning_rate": 1e-06, "loss": -0.0002, "step": 1543 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.24035638570785522, "epoch": 4.063157894736842, "grad_norm": 0.002043983433395624, "learning_rate": 1e-06, "loss": 0.0006, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 281.3984375, "completions/mean_terminated_length": 281.3984375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.24575192481279373, "epoch": 4.065789473684211, "frac_reward_zero_std": 0.75, "grad_norm": 0.0030977753922343254, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 300856319.0, "reward": 0.7493163347244263, "reward_std": 0.05255415290594101, "rewards/progression_diversity/mean": -1.282830362470122e-05, "rewards/progression_diversity/std": 0.0002902713604271412, "rewards/symbolic_reward_accuracy/mean": 0.78515625, "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, "rewards/symbolic_reward_partial_score/mean": 0.9274088144302368, "rewards/symbolic_reward_partial_score/std": 0.1551593393087387, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06221604347229, "sampling/importance_sampling_ratio/min": 0.0009545637876726687, "sampling/sampling_logp_difference/max": 6.954256057739258, "sampling/sampling_logp_difference/mean": 0.12480530142784119, "step": 1545 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2471490129828453, "epoch": 4.068421052631579, "grad_norm": 0.007246929686516523, "learning_rate": 1e-06, "loss": 0.0008, "step": 1546 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2453026995062828, "epoch": 4.071052631578947, "grad_norm": 0.0015278997598215938, "learning_rate": 1e-06, "loss": 0.0009, "step": 1547 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.2455468699336052, "epoch": 4.073684210526316, "grad_norm": 0.006542537361383438, "learning_rate": 1e-06, "loss": -0.0017, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 282.92578125, "completions/mean_terminated_length": 282.92578125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "entropy": 0.2415642812848091, "epoch": 4.076315789473684, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0035267826169729233, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 301425337.0, "reward": 0.8650867938995361, "reward_std": 0.03978986665606499, "rewards/progression_diversity/mean": -0.00011109215120086446, "rewards/progression_diversity/std": 0.0012798713287338614, "rewards/symbolic_reward_accuracy/mean": 0.94921875, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.9851887822151184, "rewards/symbolic_reward_partial_score/std": 0.07398899644613266, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060722827911377, "sampling/importance_sampling_ratio/min": 0.00038356988807208836, "sampling/sampling_logp_difference/max": 7.865988731384277, "sampling/sampling_logp_difference/mean": 0.1256132274866104, "step": 1549 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24005647003650665, "epoch": 4.078947368421052, "grad_norm": 0.0025566022377461195, "learning_rate": 1e-06, "loss": -0.0003, "step": 1550 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24505149573087692, "epoch": 4.081578947368421, "grad_norm": 0.0026614954695105553, "learning_rate": 1e-06, "loss": 0.0003, "step": 1551 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.24144987761974335, "epoch": 4.08421052631579, "grad_norm": 0.002677714917808771, "learning_rate": 1e-06, "loss": 0.0002, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 285.47265625, "completions/mean_terminated_length": 285.47265625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.24059565365314484, "epoch": 4.086842105263158, "frac_reward_zero_std": 0.65625, "grad_norm": 0.004622668959200382, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 301972299.0, "reward": 0.8511713743209839, "reward_std": 0.07054747641086578, "rewards/progression_diversity/mean": -5.1332837756490335e-05, "rewards/progression_diversity/std": 0.0011615295661613345, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.9778646230697632, "rewards/symbolic_reward_partial_score/std": 0.0910392627120018, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0624241828918457, "sampling/importance_sampling_ratio/min": 7.295302202692255e-05, "sampling/sampling_logp_difference/max": 9.525694847106934, "sampling/sampling_logp_difference/mean": 0.12439781427383423, "step": 1553 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.21875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2421875, "entropy": 0.24092917144298553, "epoch": 4.089473684210526, "grad_norm": 0.0032965033315122128, "learning_rate": 1e-06, "loss": -0.0008, "step": 1554 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.2109375, "entropy": 0.24477685242891312, "epoch": 4.092105263157895, "grad_norm": 0.004428436979651451, "learning_rate": 1e-06, "loss": 0.0008, "step": 1555 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1640625, "entropy": 0.24361905455589294, "epoch": 4.094736842105263, "grad_norm": 0.003907477483153343, "learning_rate": 1e-06, "loss": 0.0006, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 320.65625, "completions/mean_terminated_length": 289.22113037109375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2397397980093956, "epoch": 4.097368421052631, "frac_reward_zero_std": 0.875, "grad_norm": 0.0034273923374712467, "learning_rate": 1e-06, "loss": 0.0113, "num_tokens": 302529755.0, "reward": 0.8220639824867249, "reward_std": 0.04169125109910965, "rewards/progression_diversity/mean": -0.0006369180628098547, "rewards/progression_diversity/std": 0.014411810785531998, "rewards/symbolic_reward_accuracy/mean": 0.896484375, "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, "rewards/symbolic_reward_partial_score/mean": 0.9479166269302368, "rewards/symbolic_reward_partial_score/std": 0.17478591203689575, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619392395019531, "sampling/importance_sampling_ratio/min": 0.0002244680217700079, "sampling/sampling_logp_difference/max": 8.401777267456055, "sampling/sampling_logp_difference/mean": 0.12284423410892487, "step": 1557 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.24551428854465485, "epoch": 4.1, "grad_norm": 0.0029721097089350224, "learning_rate": 1e-06, "loss": 0.0006, "step": 1558 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2473151981830597, "epoch": 4.102631578947369, "grad_norm": 0.0037502862978726625, "learning_rate": 1e-06, "loss": -0.0008, "step": 1559 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.24894528090953827, "epoch": 4.105263157894737, "grad_norm": 0.004042802378535271, "learning_rate": 1e-06, "loss": 0.0009, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 290.775390625, "completions/mean_terminated_length": 290.775390625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.2489028424024582, "epoch": 4.1078947368421055, "frac_reward_zero_std": 0.78125, "grad_norm": 0.004036257974803448, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 303060744.0, "reward": 0.8253406882286072, "reward_std": 0.043370090425014496, "rewards/progression_diversity/mean": -0.00011065526632592082, "rewards/progression_diversity/std": 0.0018014537636190653, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.9698892831802368, "rewards/symbolic_reward_partial_score/std": 0.09768425673246384, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0613582134246826, "sampling/importance_sampling_ratio/min": 1.6307029000017792e-05, "sampling/sampling_logp_difference/max": 11.023914337158203, "sampling/sampling_logp_difference/mean": 0.12509672343730927, "step": 1561 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24281351268291473, "epoch": 4.110526315789474, "grad_norm": 0.004401057027280331, "learning_rate": 1e-06, "loss": 0.0011, "step": 1562 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24352271109819412, "epoch": 4.113157894736842, "grad_norm": 0.002822014968842268, "learning_rate": 1e-06, "loss": -0.0001, "step": 1563 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2425449937582016, "epoch": 4.11578947368421, "grad_norm": 0.0014352428261190653, "learning_rate": 1e-06, "loss": -0.0008, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 291.498046875, "completions/mean_terminated_length": 291.498046875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "entropy": 0.24744796752929688, "epoch": 4.118421052631579, "frac_reward_zero_std": 0.78125, "grad_norm": 0.008502251468598843, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 303627815.0, "reward": 0.7929680347442627, "reward_std": 0.04630649834871292, "rewards/progression_diversity/mean": -7.568293221993372e-05, "rewards/progression_diversity/std": 0.001712509198114276, "rewards/symbolic_reward_accuracy/mean": 0.85546875, "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, "rewards/symbolic_reward_partial_score/mean": 0.9322916865348816, "rewards/symbolic_reward_partial_score/std": 0.17656517028808594, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619150400161743, "sampling/importance_sampling_ratio/min": 4.7659916162956506e-05, "sampling/sampling_logp_difference/max": 9.951419830322266, "sampling/sampling_logp_difference/mean": 0.12444620579481125, "step": 1565 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2407190129160881, "epoch": 4.121052631578947, "grad_norm": 0.0018108681542798877, "learning_rate": 1e-06, "loss": 0.0006, "step": 1566 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24351371824741364, "epoch": 4.123684210526315, "grad_norm": 0.0014526962768286467, "learning_rate": 1e-06, "loss": 0.0003, "step": 1567 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24052952975034714, "epoch": 4.126315789473685, "grad_norm": 0.0031579281203448772, "learning_rate": 1e-06, "loss": -0.0012, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 292.951171875, "completions/mean_terminated_length": 292.951171875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "entropy": 0.2511454373598099, "epoch": 4.128947368421053, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0025576853659003973, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 304146926.0, "reward": 0.8509761691093445, "reward_std": 0.013778343796730042, "rewards/progression_diversity/mean": -4.358722799224779e-05, "rewards/progression_diversity/std": 0.0006232060259208083, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.9772135615348816, "rewards/symbolic_reward_partial_score/std": 0.08322879672050476, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0625452995300293, "sampling/importance_sampling_ratio/min": 0.004068870563060045, "sampling/sampling_logp_difference/max": 5.504389762878418, "sampling/sampling_logp_difference/mean": 0.12552489340305328, "step": 1569 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.24181534349918365, "epoch": 4.131578947368421, "grad_norm": 0.0026575892698019743, "learning_rate": 1e-06, "loss": 0.0001, "step": 1570 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24096699804067612, "epoch": 4.13421052631579, "grad_norm": 0.0008206532220356166, "learning_rate": 1e-06, "loss": 0.0004, "step": 1571 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.24758878350257874, "epoch": 4.136842105263158, "grad_norm": 0.00010886161908274516, "learning_rate": 1e-06, "loss": -0.0005, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 294.57421875, "completions/mean_terminated_length": 294.57421875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.24195092916488647, "epoch": 4.139473684210526, "frac_reward_zero_std": 0.75, "grad_norm": 0.006056688260287046, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 304681204.0, "reward": 0.8438476324081421, "reward_std": 0.04485977441072464, "rewards/progression_diversity/mean": -7.551363069069339e-06, "rewards/progression_diversity/std": 0.00017086784646380693, "rewards/symbolic_reward_accuracy/mean": 0.916015625, "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, "rewards/symbolic_reward_partial_score/mean": 0.9807943105697632, "rewards/symbolic_reward_partial_score/std": 0.06602808088064194, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0608805418014526, "sampling/importance_sampling_ratio/min": 0.002050866838544607, "sampling/sampling_logp_difference/max": 6.189492702484131, "sampling/sampling_logp_difference/mean": 0.1235751360654831, "step": 1573 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2409624606370926, "epoch": 4.1421052631578945, "grad_norm": 0.0029652619268745184, "learning_rate": 1e-06, "loss": 0.0006, "step": 1574 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24594729393720627, "epoch": 4.144736842105263, "grad_norm": 0.0016559745417907834, "learning_rate": 1e-06, "loss": 0.0006, "step": 1575 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.24038663506507874, "epoch": 4.147368421052631, "grad_norm": 0.002225496107712388, "learning_rate": 1e-06, "loss": -0.0003, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 289.96875, "completions/mean_terminated_length": 289.96875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "entropy": 0.24104474484920502, "epoch": 4.15, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0045051174238324165, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 305244484.0, "reward": 0.8866699934005737, "reward_std": 0.03702239692211151, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.982421875, "rewards/symbolic_reward_accuracy/std": 0.13154059648513794, "rewards/symbolic_reward_partial_score/mean": 0.99072265625, "rewards/symbolic_reward_partial_score/std": 0.07524821907281876, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059924602508545, "sampling/importance_sampling_ratio/min": 0.0008238049340434372, "sampling/sampling_logp_difference/max": 7.101576805114746, "sampling/sampling_logp_difference/mean": 0.12273959070444107, "step": 1577 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23388678580522537, "epoch": 4.152631578947369, "grad_norm": 0.0015148852253332734, "learning_rate": 1e-06, "loss": -0.0007, "step": 1578 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.24179504066705704, "epoch": 4.155263157894737, "grad_norm": 0.00393514521420002, "learning_rate": 1e-06, "loss": -0.0001, "step": 1579 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.24486732482910156, "epoch": 4.157894736842105, "grad_norm": 0.0008323483052663505, "learning_rate": 1e-06, "loss": -0.0002, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 324.146484375, "completions/mean_terminated_length": 292.71820068359375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "entropy": 0.2383657991886139, "epoch": 4.160526315789474, "frac_reward_zero_std": 0.75, "grad_norm": 0.007527621928602457, "learning_rate": 1e-06, "loss": 0.0112, "num_tokens": 305845135.0, "reward": 0.7925710082054138, "reward_std": 0.04505772143602371, "rewards/progression_diversity/mean": -0.0007171723991632462, "rewards/progression_diversity/std": 0.01562389824539423, "rewards/symbolic_reward_accuracy/mean": 0.845703125, "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, "rewards/symbolic_reward_partial_score/mean": 0.951171875, "rewards/symbolic_reward_partial_score/std": 0.12982474267482758, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0593748092651367, "sampling/importance_sampling_ratio/min": 4.811527674064564e-07, "sampling/sampling_logp_difference/max": 14.547080993652344, "sampling/sampling_logp_difference/mean": 0.11983726918697357, "step": 1581 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.23435619473457336, "epoch": 4.163157894736842, "grad_norm": 0.0012159182224422693, "learning_rate": 1e-06, "loss": 0.0005, "step": 1582 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.1328125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.140625, "entropy": 0.23912250995635986, "epoch": 4.16578947368421, "grad_norm": 0.0016524328384548426, "learning_rate": 1e-06, "loss": -0.0006, "step": 1583 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.24548863619565964, "epoch": 4.168421052631579, "grad_norm": 0.0025580108631402254, "learning_rate": 1e-06, "loss": -0.0001, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 317.92578125, "completions/mean_terminated_length": 286.4853210449219, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.24263640493154526, "epoch": 4.171052631578948, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0040906872600317, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 306410377.0, "reward": 0.8318202495574951, "reward_std": 0.013329317793250084, "rewards/progression_diversity/mean": -0.0015740538947284222, "rewards/progression_diversity/std": 0.03561677411198616, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9680989384651184, "rewards/symbolic_reward_partial_score/std": 0.10613958537578583, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0580620765686035, "sampling/importance_sampling_ratio/min": 1.972946165551548e-06, "sampling/sampling_logp_difference/max": 13.135982513427734, "sampling/sampling_logp_difference/mean": 0.12163875997066498, "step": 1585 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23799894750118256, "epoch": 4.173684210526316, "grad_norm": 0.0008383460226468742, "learning_rate": 1e-06, "loss": -0.0007, "step": 1586 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.23699834197759628, "epoch": 4.176315789473684, "grad_norm": 0.0012354847276583314, "learning_rate": 1e-06, "loss": 0.0001, "step": 1587 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23717937618494034, "epoch": 4.178947368421053, "grad_norm": 0.000985459191724658, "learning_rate": 1e-06, "loss": 0.0277, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 287.171875, "completions/mean_terminated_length": 287.171875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.24291031062602997, "epoch": 4.181578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0010142133105546236, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 306943873.0, "reward": 0.7869137525558472, "reward_std": 0.011718656867742538, "rewards/progression_diversity/mean": -3.545811341609806e-05, "rewards/progression_diversity/std": 0.0008023255504667759, "rewards/symbolic_reward_accuracy/mean": 0.83984375, "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, "rewards/symbolic_reward_partial_score/mean": 0.943359375, "rewards/symbolic_reward_partial_score/std": 0.14044690132141113, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0599687099456787, "sampling/importance_sampling_ratio/min": 6.863525049993768e-05, "sampling/sampling_logp_difference/max": 9.58670425415039, "sampling/sampling_logp_difference/mean": 0.1256382167339325, "step": 1589 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.2450830414891243, "epoch": 4.184210526315789, "grad_norm": 0.0008929637842811644, "learning_rate": 1e-06, "loss": -0.0003, "step": 1590 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2389954999089241, "epoch": 4.186842105263158, "grad_norm": 0.0004206775629427284, "learning_rate": 1e-06, "loss": -0.0003, "step": 1591 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.24132828414440155, "epoch": 4.189473684210526, "grad_norm": 0.000518023211043328, "learning_rate": 1e-06, "loss": -0.0003, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 286.953125, "completions/mean_terminated_length": 286.953125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.2418007254600525, "epoch": 4.192105263157894, "frac_reward_zero_std": 0.875, "grad_norm": 0.0034422457683831453, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 307499113.0, "reward": 0.8122069835662842, "reward_std": 0.018359720706939697, "rewards/progression_diversity/mean": -8.605420589447021e-06, "rewards/progression_diversity/std": 0.00013983974349685013, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9378255605697632, "rewards/symbolic_reward_partial_score/std": 0.1756001114845276, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.062021017074585, "sampling/importance_sampling_ratio/min": 9.0235989773646e-06, "sampling/sampling_logp_difference/max": 11.615667343139648, "sampling/sampling_logp_difference/mean": 0.12551844120025635, "step": 1593 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.24229955673217773, "epoch": 4.1947368421052635, "grad_norm": 0.0018708358984440565, "learning_rate": 1e-06, "loss": -0.0, "step": 1594 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24395088106393814, "epoch": 4.197368421052632, "grad_norm": 0.0030785081908106804, "learning_rate": 1e-06, "loss": -0.0004, "step": 1595 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23724327981472015, "epoch": 4.2, "grad_norm": 0.001542367972433567, "learning_rate": 1e-06, "loss": 0.0004, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 321.71875, "completions/mean_terminated_length": 290.28570556640625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.24105414748191833, "epoch": 4.2026315789473685, "frac_reward_zero_std": 0.875, "grad_norm": 0.0008341679349541664, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 308087993.0, "reward": 0.7685401439666748, "reward_std": 0.012891988269984722, "rewards/progression_diversity/mean": -0.0014598432462662458, "rewards/progression_diversity/std": 0.031166821718215942, "rewards/symbolic_reward_accuracy/mean": 0.8125, "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, "rewards/symbolic_reward_partial_score/mean": 0.9375, "rewards/symbolic_reward_partial_score/std": 0.1478261649608612, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060511827468872, "sampling/importance_sampling_ratio/min": 0.00011906772124348208, "sampling/sampling_logp_difference/max": 9.035818099975586, "sampling/sampling_logp_difference/mean": 0.12241839617490768, "step": 1597 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2469538226723671, "epoch": 4.205263157894737, "grad_norm": 0.0006272746832109988, "learning_rate": 1e-06, "loss": -0.0, "step": 1598 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23778608441352844, "epoch": 4.207894736842105, "grad_norm": 0.0050039771012961864, "learning_rate": 1e-06, "loss": -0.0003, "step": 1599 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2375095784664154, "epoch": 4.2105263157894735, "grad_norm": 0.0005078950780443847, "learning_rate": 1e-06, "loss": 0.0292, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 290.8203125, "completions/mean_terminated_length": 290.8203125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.2439725548028946, "epoch": 4.213157894736842, "frac_reward_zero_std": 0.90625, "grad_norm": 0.00519453315064311, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 308643709.0, "reward": 0.7693848013877869, "reward_std": 0.023179786279797554, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.806640625, "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, "rewards/symbolic_reward_partial_score/mean": 0.9513345956802368, "rewards/symbolic_reward_partial_score/std": 0.11196061223745346, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0619571208953857, "sampling/importance_sampling_ratio/min": 0.001721260487101972, "sampling/sampling_logp_difference/max": 6.36469841003418, "sampling/sampling_logp_difference/mean": 0.12522411346435547, "step": 1601 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.24038860201835632, "epoch": 4.215789473684211, "grad_norm": 0.0011393935419619083, "learning_rate": 1e-06, "loss": -0.0003, "step": 1602 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.24182245135307312, "epoch": 4.218421052631579, "grad_norm": 0.0009637880139052868, "learning_rate": 1e-06, "loss": -0.0005, "step": 1603 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.24568534642457962, "epoch": 4.221052631578948, "grad_norm": 0.004586696624755859, "learning_rate": 1e-06, "loss": 0.001, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 290.912109375, "completions/mean_terminated_length": 290.912109375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "entropy": 0.24403854459524155, "epoch": 4.223684210526316, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0022931001149117947, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 309202480.0, "reward": 0.8104981184005737, "reward_std": 0.032899536192417145, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.876953125, "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, "rewards/symbolic_reward_partial_score/mean": 0.94775390625, "rewards/symbolic_reward_partial_score/std": 0.14531104266643524, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0620423555374146, "sampling/importance_sampling_ratio/min": 1.6925074305618182e-05, "sampling/sampling_logp_difference/max": 10.986714363098145, "sampling/sampling_logp_difference/mean": 0.1268502175807953, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2432192713022232, "epoch": 4.226315789473684, "grad_norm": 0.0034322640858590603, "learning_rate": 1e-06, "loss": 0.0012, "step": 1606 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2459331378340721, "epoch": 4.228947368421053, "grad_norm": 0.00626370171085, "learning_rate": 1e-06, "loss": -0.0002, "step": 1607 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24618712812662125, "epoch": 4.231578947368421, "grad_norm": 0.0018219529883936048, "learning_rate": 1e-06, "loss": -0.0005, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 285.6875, "completions/mean_terminated_length": 285.6875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.24345583468675613, "epoch": 4.234210526315789, "frac_reward_zero_std": 0.875, "grad_norm": 0.003988837357610464, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 309748208.0, "reward": 0.87353515625, "reward_std": 0.031940147280693054, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.962890625, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.9860026240348816, "rewards/symbolic_reward_partial_score/std": 0.0735032856464386, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0620702505111694, "sampling/importance_sampling_ratio/min": 0.00010400653991382569, "sampling/sampling_logp_difference/max": 9.171056747436523, "sampling/sampling_logp_difference/mean": 0.12525449693202972, "step": 1609 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2439560666680336, "epoch": 4.2368421052631575, "grad_norm": 0.005749448202550411, "learning_rate": 1e-06, "loss": -0.0001, "step": 1610 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.242917962372303, "epoch": 4.239473684210527, "grad_norm": 0.0013718483969569206, "learning_rate": 1e-06, "loss": -0.0004, "step": 1611 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24474342167377472, "epoch": 4.242105263157895, "grad_norm": 0.0005026358412578702, "learning_rate": 1e-06, "loss": 0.0008, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 288.400390625, "completions/mean_terminated_length": 288.400390625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "entropy": 0.24127425998449326, "epoch": 4.244736842105263, "frac_reward_zero_std": 0.78125, "grad_norm": 0.0018319800728932023, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 310305693.0, "reward": 0.8312011957168579, "reward_std": 0.03613283112645149, "rewards/progression_diversity/mean": -4.250822485118988e-07, "rewards/progression_diversity/std": 9.618513104214799e-06, "rewards/symbolic_reward_accuracy/mean": 0.90625, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, "rewards/symbolic_reward_partial_score/std": 0.13332508504390717, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061629295349121, "sampling/importance_sampling_ratio/min": 0.0001460511703044176, "sampling/sampling_logp_difference/max": 8.83155345916748, "sampling/sampling_logp_difference/mean": 0.12399622797966003, "step": 1613 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.24228737503290176, "epoch": 4.247368421052632, "grad_norm": 0.0009619845659472048, "learning_rate": 1e-06, "loss": 0.0001, "step": 1614 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.23787134885787964, "epoch": 4.25, "grad_norm": 0.0031648194417357445, "learning_rate": 1e-06, "loss": -0.0006, "step": 1615 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.24200475215911865, "epoch": 4.252631578947368, "grad_norm": 0.0008313195430673659, "learning_rate": 1e-06, "loss": 0.0001, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 285.142578125, "completions/mean_terminated_length": 285.142578125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.23958918452262878, "epoch": 4.255263157894737, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0013227357994765043, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 310838470.0, "reward": 0.8195312023162842, "reward_std": 0.00983283668756485, "rewards/progression_diversity/mean": -6.325509730231715e-06, "rewards/progression_diversity/std": 0.0001431299460818991, "rewards/symbolic_reward_accuracy/mean": 0.8828125, "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, "rewards/symbolic_reward_partial_score/mean": 0.9661458134651184, "rewards/symbolic_reward_partial_score/std": 0.10655689239501953, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060362696647644, "sampling/importance_sampling_ratio/min": 0.00016181122919078916, "sampling/sampling_logp_difference/max": 8.729080200195312, "sampling/sampling_logp_difference/mean": 0.12306272983551025, "step": 1617 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2417643666267395, "epoch": 4.257894736842105, "grad_norm": 0.001047020428813994, "learning_rate": 1e-06, "loss": 0.0002, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23926913738250732, "epoch": 4.260526315789473, "grad_norm": 0.0005425025010481477, "learning_rate": 1e-06, "loss": 0.0005, "step": 1619 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.24134934693574905, "epoch": 4.2631578947368425, "grad_norm": 2.3157368559623137e-05, "learning_rate": 1e-06, "loss": -0.0005, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 279.1875, "completions/mean_terminated_length": 279.1875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.24217140674591064, "epoch": 4.265789473684211, "frac_reward_zero_std": 0.875, "grad_norm": 0.008381368592381477, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 311348166.0, "reward": 0.8707519769668579, "reward_std": 0.02324218861758709, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.9806314706802368, "rewards/symbolic_reward_partial_score/std": 0.10037314146757126, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059480905532837, "sampling/importance_sampling_ratio/min": 0.0010475642047822475, "sampling/sampling_logp_difference/max": 6.861287593841553, "sampling/sampling_logp_difference/mean": 0.12341621518135071, "step": 1621 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2414199411869049, "epoch": 4.268421052631579, "grad_norm": 0.0014712604461237788, "learning_rate": 1e-06, "loss": -0.0005, "step": 1622 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.24441535025835037, "epoch": 4.271052631578947, "grad_norm": 0.001006963080726564, "learning_rate": 1e-06, "loss": 0.0006, "step": 1623 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23892833292484283, "epoch": 4.273684210526316, "grad_norm": 0.0006697697681374848, "learning_rate": 1e-06, "loss": -0.0006, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 279.873046875, "completions/mean_terminated_length": 279.873046875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2365439236164093, "epoch": 4.276315789473684, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0022215263452380896, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 311921157.0, "reward": 0.8255848288536072, "reward_std": 0.0127988550812006, "rewards/progression_diversity/mean": -0.00011174208339070901, "rewards/progression_diversity/std": 0.002458578674122691, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.962890625, "rewards/symbolic_reward_partial_score/std": 0.1252562403678894, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0597436428070068, "sampling/importance_sampling_ratio/min": 0.003232834627851844, "sampling/sampling_logp_difference/max": 5.734395980834961, "sampling/sampling_logp_difference/mean": 0.11962610483169556, "step": 1625 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23193275928497314, "epoch": 4.278947368421052, "grad_norm": 0.0013880907790735364, "learning_rate": 1e-06, "loss": -0.0003, "step": 1626 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2370048388838768, "epoch": 4.281578947368421, "grad_norm": 0.0017254366539418697, "learning_rate": 1e-06, "loss": -0.0002, "step": 1627 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.24163448810577393, "epoch": 4.284210526315789, "grad_norm": 0.002020586049184203, "learning_rate": 1e-06, "loss": -0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 347.330078125, "completions/mean_terminated_length": 284.4411926269531, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.23737385869026184, "epoch": 4.286842105263158, "frac_reward_zero_std": 0.875, "grad_norm": 0.0008498894167132676, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 312490446.0, "reward": 0.8547199368476868, "reward_std": 0.013046052306890488, "rewards/progression_diversity/mean": -0.0016431687399744987, "rewards/progression_diversity/std": 0.0253975298255682, "rewards/symbolic_reward_accuracy/mean": 0.93359375, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.9832357168197632, "rewards/symbolic_reward_partial_score/std": 0.07629074156284332, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0584070682525635, "sampling/importance_sampling_ratio/min": 5.360832915357605e-07, "sampling/sampling_logp_difference/max": 14.438976287841797, "sampling/sampling_logp_difference/mean": 0.1209835484623909, "step": 1629 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2417454794049263, "epoch": 4.2894736842105265, "grad_norm": 0.0014678961597383022, "learning_rate": 1e-06, "loss": 0.0259, "step": 1630 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.24222751706838608, "epoch": 4.292105263157895, "grad_norm": 0.0007775401463732123, "learning_rate": 1e-06, "loss": -0.0002, "step": 1631 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23408235609531403, "epoch": 4.294736842105263, "grad_norm": 0.00032593103242106736, "learning_rate": 1e-06, "loss": 0.0038, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 279.724609375, "completions/mean_terminated_length": 279.724609375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.23920771479606628, "epoch": 4.2973684210526315, "frac_reward_zero_std": 0.875, "grad_norm": 0.0023012347519397736, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 313035969.0, "reward": 0.7753905057907104, "reward_std": 0.024877512827515602, "rewards/progression_diversity/mean": -1.7009941075230017e-05, "rewards/progression_diversity/std": 0.00038489102735184133, "rewards/symbolic_reward_accuracy/mean": 0.826171875, "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, "rewards/symbolic_reward_partial_score/mean": 0.9322916269302368, "rewards/symbolic_reward_partial_score/std": 0.16144797205924988, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0600590705871582, "sampling/importance_sampling_ratio/min": 0.00021802130504511297, "sampling/sampling_logp_difference/max": 8.430917739868164, "sampling/sampling_logp_difference/mean": 0.1208389550447464, "step": 1633 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23538841307163239, "epoch": 4.3, "grad_norm": 0.000836519175209105, "learning_rate": 1e-06, "loss": 0.0001, "step": 1634 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23597903549671173, "epoch": 4.302631578947368, "grad_norm": 0.002698976546525955, "learning_rate": 1e-06, "loss": -0.0008, "step": 1635 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23745371401309967, "epoch": 4.3052631578947365, "grad_norm": 0.0009186923853121698, "learning_rate": 1e-06, "loss": 0.0005, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9394.0, "completions/mean_length": 328.09375, "completions/mean_terminated_length": 296.6731872558594, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.23953492939472198, "epoch": 4.307894736842106, "frac_reward_zero_std": 0.875, "grad_norm": 0.008782876655459404, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 313556529.0, "reward": 0.8749912977218628, "reward_std": 0.01434220839291811, "rewards/progression_diversity/mean": -0.0008723714272491634, "rewards/progression_diversity/std": 0.012109385803341866, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.9876302480697632, "rewards/symbolic_reward_partial_score/std": 0.07861035317182541, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0582417249679565, "sampling/importance_sampling_ratio/min": 2.171735104639083e-05, "sampling/sampling_logp_difference/max": 10.737399101257324, "sampling/sampling_logp_difference/mean": 0.12100666016340256, "step": 1637 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.24021229147911072, "epoch": 4.310526315789474, "grad_norm": 0.001116728177294135, "learning_rate": 1e-06, "loss": -0.0006, "step": 1638 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23746858537197113, "epoch": 4.313157894736842, "grad_norm": 0.0025934746954590082, "learning_rate": 1e-06, "loss": 0.0241, "step": 1639 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2366245537996292, "epoch": 4.315789473684211, "grad_norm": 0.0006494184490293264, "learning_rate": 1e-06, "loss": 0.0084, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 282.509765625, "completions/mean_terminated_length": 282.509765625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.2368258759379387, "epoch": 4.318421052631579, "frac_reward_zero_std": 1.0, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 314122326.0, "reward": 0.8359375, "reward_std": 0.0, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.90625, "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, "rewards/symbolic_reward_partial_score/mean": 0.9739583730697632, "rewards/symbolic_reward_partial_score/std": 0.08454757928848267, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059388518333435, "sampling/importance_sampling_ratio/min": 0.005422016140073538, "sampling/sampling_logp_difference/max": 5.217287540435791, "sampling/sampling_logp_difference/mean": 0.12114188820123672, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23350204527378082, "epoch": 4.321052631578947, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23761139810085297, "epoch": 4.323684210526316, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.23665422201156616, "epoch": 4.326315789473684, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 281.400390625, "completions/mean_terminated_length": 281.400390625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "entropy": 0.2357463464140892, "epoch": 4.328947368421053, "frac_reward_zero_std": 0.90625, "grad_norm": 0.002676774049177766, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 314671395.0, "reward": 0.8441406488418579, "reward_std": 0.023877985775470734, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.919921875, "rewards/symbolic_reward_accuracy/std": 0.271679550409317, "rewards/symbolic_reward_partial_score/mean": 0.9739583730697632, "rewards/symbolic_reward_partial_score/std": 0.09836674481630325, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0592198371887207, "sampling/importance_sampling_ratio/min": 2.6666658214935524e-08, "sampling/sampling_logp_difference/max": 17.439851760864258, "sampling/sampling_logp_difference/mean": 0.12135563790798187, "step": 1645 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23633001744747162, "epoch": 4.331578947368421, "grad_norm": 0.0014993188669905066, "learning_rate": 1e-06, "loss": 0.0002, "step": 1646 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23138535022735596, "epoch": 4.33421052631579, "grad_norm": 0.0018792420160025358, "learning_rate": 1e-06, "loss": -0.0004, "step": 1647 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23762395977973938, "epoch": 4.336842105263158, "grad_norm": 0.0003342384588904679, "learning_rate": 1e-06, "loss": -0.0001, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 285.7578125, "completions/mean_terminated_length": 285.7578125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.23968391120433807, "epoch": 4.339473684210526, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0011085503501817584, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 315219847.0, "reward": 0.8956515789031982, "reward_std": 0.017393916845321655, "rewards/progression_diversity/mean": -0.0002775713801383972, "rewards/progression_diversity/std": 0.004437287338078022, "rewards/symbolic_reward_accuracy/mean": 0.994140625, "rewards/symbolic_reward_accuracy/std": 0.07639661431312561, "rewards/symbolic_reward_partial_score/mean": 0.9972330331802368, "rewards/symbolic_reward_partial_score/std": 0.04610797390341759, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0596669912338257, "sampling/importance_sampling_ratio/min": 3.726242721313611e-05, "sampling/sampling_logp_difference/max": 10.197525024414062, "sampling/sampling_logp_difference/mean": 0.12217455357313156, "step": 1649 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.23926863819360733, "epoch": 4.342105263157895, "grad_norm": 0.0010118326172232628, "learning_rate": 1e-06, "loss": -0.0006, "step": 1650 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23653405904769897, "epoch": 4.344736842105263, "grad_norm": 0.0055463118478655815, "learning_rate": 1e-06, "loss": 0.0009, "step": 1651 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.24093815684318542, "epoch": 4.347368421052631, "grad_norm": 0.0011176273692399263, "learning_rate": 1e-06, "loss": 0.0002, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 284.4453125, "completions/mean_terminated_length": 284.4453125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.23320885747671127, "epoch": 4.35, "frac_reward_zero_std": 0.875, "grad_norm": 0.003535026917234063, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 315769131.0, "reward": 0.8701168298721313, "reward_std": 0.02188323438167572, "rewards/progression_diversity/mean": -4.30387444794178e-05, "rewards/progression_diversity/std": 0.0009738556109368801, "rewards/symbolic_reward_accuracy/mean": 0.95703125, "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, "rewards/symbolic_reward_partial_score/mean": 0.986328125, "rewards/symbolic_reward_partial_score/std": 0.06698986887931824, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059899926185608, "sampling/importance_sampling_ratio/min": 1.0019117553383694e-06, "sampling/sampling_logp_difference/max": 13.813600540161133, "sampling/sampling_logp_difference/mean": 0.12067143619060516, "step": 1653 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2401559054851532, "epoch": 4.352631578947369, "grad_norm": 0.001625710865482688, "learning_rate": 1e-06, "loss": -0.0007, "step": 1654 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24083483219146729, "epoch": 4.355263157894737, "grad_norm": 0.0013069827109575272, "learning_rate": 1e-06, "loss": -0.0001, "step": 1655 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23248496651649475, "epoch": 4.3578947368421055, "grad_norm": 0.003529660403728485, "learning_rate": 1e-06, "loss": 0.001, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 283.091796875, "completions/mean_terminated_length": 283.091796875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.23519539088010788, "epoch": 4.360526315789474, "frac_reward_zero_std": 0.875, "grad_norm": 0.0036818196531385183, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 316333722.0, "reward": 0.872850775718689, "reward_std": 0.014050932601094246, "rewards/progression_diversity/mean": -8.317209722008556e-05, "rewards/progression_diversity/std": 0.0014961487613618374, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.9876302480697632, "rewards/symbolic_reward_partial_score/std": 0.06307155638933182, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0588300228118896, "sampling/importance_sampling_ratio/min": 3.670607475214638e-05, "sampling/sampling_logp_difference/max": 10.212568283081055, "sampling/sampling_logp_difference/mean": 0.12060007452964783, "step": 1657 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23568804562091827, "epoch": 4.363157894736842, "grad_norm": 0.003883509198203683, "learning_rate": 1e-06, "loss": 0.0001, "step": 1658 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2364446073770523, "epoch": 4.36578947368421, "grad_norm": 0.0005405032425187528, "learning_rate": 1e-06, "loss": 0.0006, "step": 1659 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23525787144899368, "epoch": 4.368421052631579, "grad_norm": 0.0007799993618391454, "learning_rate": 1e-06, "loss": -0.0005, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 283.634765625, "completions/mean_terminated_length": 283.634765625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.23709560930728912, "epoch": 4.371052631578947, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0018411829369142652, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 316892255.0, "reward": 0.8917969465255737, "reward_std": 0.01621941104531288, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.98828125, "rewards/symbolic_reward_accuracy/std": 0.10772226005792618, "rewards/symbolic_reward_partial_score/mean": 0.99609375, "rewards/symbolic_reward_partial_score/std": 0.04019329324364662, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058568000793457, "sampling/importance_sampling_ratio/min": 2.124160801031394e-06, "sampling/sampling_logp_difference/max": 13.0621337890625, "sampling/sampling_logp_difference/mean": 0.12094350159168243, "step": 1661 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23370085656642914, "epoch": 4.373684210526315, "grad_norm": 0.00029920003726147115, "learning_rate": 1e-06, "loss": -0.0005, "step": 1662 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23229587078094482, "epoch": 4.376315789473685, "grad_norm": 0.00032023570383898914, "learning_rate": 1e-06, "loss": -0.0002, "step": 1663 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23625388741493225, "epoch": 4.378947368421053, "grad_norm": 0.0019593036267906427, "learning_rate": 1e-06, "loss": 0.0004, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 279.814453125, "completions/mean_terminated_length": 279.814453125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.2396935299038887, "epoch": 4.381578947368421, "frac_reward_zero_std": 0.875, "grad_norm": 0.0010560675291344523, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 317459520.0, "reward": 0.8500972986221313, "reward_std": 0.01595105230808258, "rewards/progression_diversity/mean": -4.367587098386139e-05, "rewards/progression_diversity/std": 0.0008068631868809462, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.9742838144302368, "rewards/symbolic_reward_partial_score/std": 0.10590074211359024, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0598008632659912, "sampling/importance_sampling_ratio/min": 5.395947368924681e-07, "sampling/sampling_logp_difference/max": 14.43244743347168, "sampling/sampling_logp_difference/mean": 0.12156838923692703, "step": 1665 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23540012538433075, "epoch": 4.38421052631579, "grad_norm": 9.158550528809428e-05, "learning_rate": 1e-06, "loss": -0.0001, "step": 1666 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23555586487054825, "epoch": 4.386842105263158, "grad_norm": 0.0010763579048216343, "learning_rate": 1e-06, "loss": -0.0005, "step": 1667 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2318090721964836, "epoch": 4.389473684210526, "grad_norm": 0.00030530893127433956, "learning_rate": 1e-06, "loss": 0.0002, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 283.79296875, "completions/mean_terminated_length": 283.79296875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "entropy": 0.2331906408071518, "epoch": 4.3921052631578945, "frac_reward_zero_std": 0.875, "grad_norm": 0.0010331433732062578, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 318019158.0, "reward": 0.8382322788238525, "reward_std": 0.009019151329994202, "rewards/progression_diversity/mean": -1.49488914757967e-05, "rewards/progression_diversity/std": 0.0002401837264187634, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.9737955331802368, "rewards/symbolic_reward_partial_score/std": 0.09921777248382568, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.061126947402954, "sampling/importance_sampling_ratio/min": 1.443103337805951e-05, "sampling/sampling_logp_difference/max": 11.146129608154297, "sampling/sampling_logp_difference/mean": 0.11966268718242645, "step": 1669 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2346176952123642, "epoch": 4.394736842105263, "grad_norm": 0.0002629106165841222, "learning_rate": 1e-06, "loss": -0.0005, "step": 1670 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2403409779071808, "epoch": 4.397368421052631, "grad_norm": 0.0007090272847563028, "learning_rate": 1e-06, "loss": 0.0004, "step": 1671 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2349977195262909, "epoch": 4.4, "grad_norm": 0.0034831841476261616, "learning_rate": 1e-06, "loss": 0.0003, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 279.677734375, "completions/mean_terminated_length": 279.677734375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.23757142573595047, "epoch": 4.402631578947369, "frac_reward_zero_std": 0.875, "grad_norm": 0.0032499730587005615, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 318557297.0, "reward": 0.8682129383087158, "reward_std": 0.027213580906391144, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.953125, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.98779296875, "rewards/symbolic_reward_partial_score/std": 0.05615198239684105, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058640480041504, "sampling/importance_sampling_ratio/min": 3.7312423728508293e-07, "sampling/sampling_logp_difference/max": 14.80135440826416, "sampling/sampling_logp_difference/mean": 0.1246851310133934, "step": 1673 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.24083828926086426, "epoch": 4.405263157894737, "grad_norm": 0.001479274476878345, "learning_rate": 1e-06, "loss": 0.0009, "step": 1674 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23393068462610245, "epoch": 4.407894736842105, "grad_norm": 0.0007833510753698647, "learning_rate": 1e-06, "loss": -0.0, "step": 1675 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23537438362836838, "epoch": 4.410526315789474, "grad_norm": 0.0011368156410753727, "learning_rate": 1e-06, "loss": -0.0003, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 279.228515625, "completions/mean_terminated_length": 279.228515625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "entropy": 0.23278649151325226, "epoch": 4.413157894736842, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0007415832951664925, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 319075878.0, "reward": 0.8525390625, "reward_std": 0.011718750931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.93359375, "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, "rewards/symbolic_reward_partial_score/mean": 0.974609375, "rewards/symbolic_reward_partial_score/std": 0.10058461129665375, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059082269668579, "sampling/importance_sampling_ratio/min": 0.0008237397414632142, "sampling/sampling_logp_difference/max": 7.101655960083008, "sampling/sampling_logp_difference/mean": 0.11824500560760498, "step": 1677 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23778235167264938, "epoch": 4.41578947368421, "grad_norm": 0.0005469997995533049, "learning_rate": 1e-06, "loss": -0.0003, "step": 1678 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.23452699184417725, "epoch": 4.418421052631579, "grad_norm": 0.0004806320648640394, "learning_rate": 1e-06, "loss": -0.0003, "step": 1679 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.22962932288646698, "epoch": 4.421052631578947, "grad_norm": 0.006431492045521736, "learning_rate": 1e-06, "loss": 0.0002, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 280.94921875, "completions/mean_terminated_length": 280.94921875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.2355402261018753, "epoch": 4.423684210526316, "frac_reward_zero_std": 0.875, "grad_norm": 0.0020566831808537245, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 319644044.0, "reward": 0.8218750357627869, "reward_std": 0.02611433155834675, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.884765625, "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, "rewards/symbolic_reward_partial_score/mean": 0.9700520634651184, "rewards/symbolic_reward_partial_score/std": 0.09696658700704575, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0602164268493652, "sampling/importance_sampling_ratio/min": 0.0002465007419232279, "sampling/sampling_logp_difference/max": 8.308145523071289, "sampling/sampling_logp_difference/mean": 0.1220279335975647, "step": 1681 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.24096575379371643, "epoch": 4.426315789473684, "grad_norm": 0.001502837985754013, "learning_rate": 1e-06, "loss": -0.0008, "step": 1682 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2392711043357849, "epoch": 4.428947368421053, "grad_norm": 0.0013740418944507837, "learning_rate": 1e-06, "loss": 0.0007, "step": 1683 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.24290843307971954, "epoch": 4.431578947368421, "grad_norm": 0.0006701555685140193, "learning_rate": 1e-06, "loss": 0.0001, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 280.751953125, "completions/mean_terminated_length": 280.751953125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.23609529435634613, "epoch": 4.434210526315789, "frac_reward_zero_std": 0.96875, "grad_norm": 0.004810694605112076, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 320206797.0, "reward": 0.8401367664337158, "reward_std": 0.00967772863805294, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9140625, "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, "rewards/symbolic_reward_partial_score/mean": 0.9723306894302368, "rewards/symbolic_reward_partial_score/std": 0.10186579823493958, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0596872568130493, "sampling/importance_sampling_ratio/min": 2.2254751002037665e-06, "sampling/sampling_logp_difference/max": 13.01554012298584, "sampling/sampling_logp_difference/mean": 0.1213376522064209, "step": 1685 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2319614738225937, "epoch": 4.436842105263158, "grad_norm": 0.0014053754275664687, "learning_rate": 1e-06, "loss": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.2342107743024826, "epoch": 4.439473684210526, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0003, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2317662015557289, "epoch": 4.442105263157894, "grad_norm": 0.0037717849481850863, "learning_rate": 1e-06, "loss": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 278.859375, "completions/mean_terminated_length": 278.859375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.2374890148639679, "epoch": 4.4447368421052635, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0034204961266368628, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 320742533.0, "reward": 0.8681638836860657, "reward_std": 0.01761283539235592, "rewards/progression_diversity/mean": -2.218061126768589e-05, "rewards/progression_diversity/std": 0.0005018899100832641, "rewards/symbolic_reward_accuracy/mean": 0.951171875, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.9915364384651184, "rewards/symbolic_reward_partial_score/std": 0.037724245339632034, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0595741271972656, "sampling/importance_sampling_ratio/min": 1.0624612514220644e-05, "sampling/sampling_logp_difference/max": 11.452337265014648, "sampling/sampling_logp_difference/mean": 0.12184470891952515, "step": 1689 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23680029064416885, "epoch": 4.447368421052632, "grad_norm": 0.00370118604041636, "learning_rate": 1e-06, "loss": 0.0005, "step": 1690 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23859335482120514, "epoch": 4.45, "grad_norm": 0.0015089736552909017, "learning_rate": 1e-06, "loss": 0.0001, "step": 1691 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2354060262441635, "epoch": 4.4526315789473685, "grad_norm": 0.0017409600550308824, "learning_rate": 1e-06, "loss": -0.0004, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 282.396484375, "completions/mean_terminated_length": 282.396484375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.2397817000746727, "epoch": 4.455263157894737, "frac_reward_zero_std": 0.90625, "grad_norm": 0.007454941049218178, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 321276912.0, "reward": 0.8334946632385254, "reward_std": 0.005865141749382019, "rewards/progression_diversity/mean": -0.0001441855274606496, "rewards/progression_diversity/std": 0.0024001228157430887, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9619140625, "rewards/symbolic_reward_partial_score/std": 0.12517951428890228, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060306429862976, "sampling/importance_sampling_ratio/min": 7.964307769725565e-06, "sampling/sampling_logp_difference/max": 11.740540504455566, "sampling/sampling_logp_difference/mean": 0.12180732935667038, "step": 1693 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23552466928958893, "epoch": 4.457894736842105, "grad_norm": 0.0003189202107023448, "learning_rate": 1e-06, "loss": 0.0003, "step": 1694 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2354494035243988, "epoch": 4.4605263157894735, "grad_norm": 0.00015152331616263837, "learning_rate": 1e-06, "loss": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23567309230566025, "epoch": 4.463157894736842, "grad_norm": 0.00037868903018534184, "learning_rate": 1e-06, "loss": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 281.587890625, "completions/mean_terminated_length": 281.587890625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.2353179082274437, "epoch": 4.465789473684211, "frac_reward_zero_std": 0.875, "grad_norm": 0.0016322329174727201, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 321834077.0, "reward": 0.8449705839157104, "reward_std": 0.01574588194489479, "rewards/progression_diversity/mean": -1.4576362445950508e-05, "rewards/progression_diversity/std": 0.0002419299999019131, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9650065302848816, "rewards/symbolic_reward_partial_score/std": 0.1250525414943695, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0596911907196045, "sampling/importance_sampling_ratio/min": 0.0011949631152674556, "sampling/sampling_logp_difference/max": 6.729640007019043, "sampling/sampling_logp_difference/mean": 0.12178048491477966, "step": 1697 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23920729756355286, "epoch": 4.468421052631579, "grad_norm": 0.0006203987868502736, "learning_rate": 1e-06, "loss": -0.0002, "step": 1698 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23722657561302185, "epoch": 4.471052631578948, "grad_norm": 0.0048148296773433685, "learning_rate": 1e-06, "loss": 0.0007, "step": 1699 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24049177765846252, "epoch": 4.473684210526316, "grad_norm": 0.0043615782633423805, "learning_rate": 1e-06, "loss": -0.0002, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 280.58984375, "completions/mean_terminated_length": 280.58984375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.2388734519481659, "epoch": 4.476315789473684, "frac_reward_zero_std": 0.90625, "grad_norm": 0.002722710371017456, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 322378539.0, "reward": 0.8768066763877869, "reward_std": 0.00659077987074852, "rewards/progression_diversity/mean": -1.1165892601638916e-06, "rewards/progression_diversity/std": 2.5265529984608293e-05, "rewards/symbolic_reward_accuracy/mean": 0.966796875, "rewards/symbolic_reward_accuracy/std": 0.17934183776378632, "rewards/symbolic_reward_partial_score/mean": 0.9890950918197632, "rewards/symbolic_reward_partial_score/std": 0.06236884742975235, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0597413778305054, "sampling/importance_sampling_ratio/min": 0.0002870491589419544, "sampling/sampling_logp_difference/max": 8.15585708618164, "sampling/sampling_logp_difference/mean": 0.11984318494796753, "step": 1701 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23890526592731476, "epoch": 4.478947368421053, "grad_norm": 0.0009888506028801203, "learning_rate": 1e-06, "loss": 0.0006, "step": 1702 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23294271528720856, "epoch": 4.481578947368421, "grad_norm": 0.00026438181521371007, "learning_rate": 1e-06, "loss": -0.0002, "step": 1703 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.23210904002189636, "epoch": 4.484210526315789, "grad_norm": 0.0012112339027225971, "learning_rate": 1e-06, "loss": -0.0002, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1047.0, "completions/max_terminated_length": 1047.0, "completions/mean_length": 289.240234375, "completions/mean_terminated_length": 289.240234375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "entropy": 0.2412128746509552, "epoch": 4.4868421052631575, "frac_reward_zero_std": 0.875, "grad_norm": 0.0025596783962100744, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 322939782.0, "reward": 0.866014838218689, "reward_std": 0.018494877964258194, "rewards/progression_diversity/mean": -7.871522393543273e-05, "rewards/progression_diversity/std": 0.0012731452006846666, "rewards/symbolic_reward_accuracy/mean": 0.951171875, "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, "rewards/symbolic_reward_partial_score/mean": 0.984375, "rewards/symbolic_reward_partial_score/std": 0.08390216529369354, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0600802898406982, "sampling/importance_sampling_ratio/min": 6.673198367934674e-05, "sampling/sampling_logp_difference/max": 9.614826202392578, "sampling/sampling_logp_difference/mean": 0.12298185378313065, "step": 1705 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.23964014649391174, "epoch": 4.489473684210527, "grad_norm": 0.012937691994011402, "learning_rate": 1e-06, "loss": -0.0003, "step": 1706 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.24078621715307236, "epoch": 4.492105263157895, "grad_norm": 0.002141481265425682, "learning_rate": 1e-06, "loss": 0.0001, "step": 1707 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23956668376922607, "epoch": 4.494736842105263, "grad_norm": 0.0020123261492699385, "learning_rate": 1e-06, "loss": 0.0008, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 285.2578125, "completions/mean_terminated_length": 285.2578125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.23070378601551056, "epoch": 4.497368421052632, "frac_reward_zero_std": 0.875, "grad_norm": 0.005668024532496929, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 323472618.0, "reward": 0.869824230670929, "reward_std": 0.01963612250983715, "rewards/progression_diversity/mean": -3.7124846130609512e-06, "rewards/progression_diversity/std": 8.400394290219992e-05, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9814453125, "rewards/symbolic_reward_partial_score/std": 0.09198538959026337, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060324788093567, "sampling/importance_sampling_ratio/min": 0.00014617157285101712, "sampling/sampling_logp_difference/max": 8.830729484558105, "sampling/sampling_logp_difference/mean": 0.12136264890432358, "step": 1709 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2371450588107109, "epoch": 4.5, "grad_norm": 0.0007223788998089731, "learning_rate": 1e-06, "loss": -0.0003, "step": 1710 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2418120726943016, "epoch": 4.502631578947368, "grad_norm": 0.0038238605484366417, "learning_rate": 1e-06, "loss": -0.0001, "step": 1711 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23640219867229462, "epoch": 4.505263157894737, "grad_norm": 0.0007986408309079707, "learning_rate": 1e-06, "loss": -0.0002, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 286.01171875, "completions/mean_terminated_length": 286.01171875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "entropy": 0.23386920988559723, "epoch": 4.507894736842105, "frac_reward_zero_std": 0.84375, "grad_norm": 0.01059133093804121, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 324021840.0, "reward": 0.8559077978134155, "reward_std": 0.03120303340256214, "rewards/progression_diversity/mean": -4.882874418399297e-05, "rewards/progression_diversity/std": 0.0011048683663830161, "rewards/symbolic_reward_accuracy/mean": 0.935546875, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.98193359375, "rewards/symbolic_reward_partial_score/std": 0.08151435852050781, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0577611923217773, "sampling/importance_sampling_ratio/min": 3.047924337806762e-06, "sampling/sampling_logp_difference/max": 12.7010498046875, "sampling/sampling_logp_difference/mean": 0.12323953211307526, "step": 1713 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.2332010492682457, "epoch": 4.510526315789473, "grad_norm": 0.002193406457081437, "learning_rate": 1e-06, "loss": 0.0011, "step": 1714 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.23637745529413223, "epoch": 4.5131578947368425, "grad_norm": 0.0024835586082190275, "learning_rate": 1e-06, "loss": -0.001, "step": 1715 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23958788067102432, "epoch": 4.515789473684211, "grad_norm": 0.0006318061496131122, "learning_rate": 1e-06, "loss": 0.0002, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 283.26953125, "completions/mean_terminated_length": 283.26953125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.23540818691253662, "epoch": 4.518421052631579, "frac_reward_zero_std": 0.9375, "grad_norm": 0.005175166297703981, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 324558490.0, "reward": 0.8326165676116943, "reward_std": 0.005581952165812254, "rewards/progression_diversity/mean": -6.77828211337328e-05, "rewards/progression_diversity/std": 0.0015337502118200064, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.958984375, "rewards/symbolic_reward_partial_score/std": 0.13281294703483582, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0589946508407593, "sampling/importance_sampling_ratio/min": 5.9129040892003104e-05, "sampling/sampling_logp_difference/max": 9.735788345336914, "sampling/sampling_logp_difference/mean": 0.12047428637742996, "step": 1717 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23957037180662155, "epoch": 4.521052631578947, "grad_norm": 0.0002911387709900737, "learning_rate": 1e-06, "loss": 0.0001, "step": 1718 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.23324524611234665, "epoch": 4.523684210526316, "grad_norm": 0.00015874733799137175, "learning_rate": 1e-06, "loss": 0.0001, "step": 1719 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23140724003314972, "epoch": 4.526315789473684, "grad_norm": 0.0002466792648192495, "learning_rate": 1e-06, "loss": 0.0003, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 280.705078125, "completions/mean_terminated_length": 280.705078125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.23024223744869232, "epoch": 4.528947368421052, "frac_reward_zero_std": 0.90625, "grad_norm": 0.00565887289121747, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 325127555.0, "reward": 0.8357421159744263, "reward_std": 0.021845251321792603, "rewards/progression_diversity/mean": -8.315712875628378e-06, "rewards/progression_diversity/std": 0.0001881630887510255, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.9694010019302368, "rewards/symbolic_reward_partial_score/std": 0.10498092323541641, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0591754913330078, "sampling/importance_sampling_ratio/min": 0.0007685017772018909, "sampling/sampling_logp_difference/max": 7.171067714691162, "sampling/sampling_logp_difference/mean": 0.1192249283194542, "step": 1721 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23211494088172913, "epoch": 4.531578947368421, "grad_norm": 0.0007768426439724863, "learning_rate": 1e-06, "loss": -0.0006, "step": 1722 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23332732915878296, "epoch": 4.534210526315789, "grad_norm": 0.0008748367545194924, "learning_rate": 1e-06, "loss": 0.0005, "step": 1723 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2306608259677887, "epoch": 4.536842105263158, "grad_norm": 0.003938390873372555, "learning_rate": 1e-06, "loss": 0.0006, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 314.302734375, "completions/mean_terminated_length": 282.8551940917969, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.22948867827653885, "epoch": 4.5394736842105265, "frac_reward_zero_std": 0.84375, "grad_norm": 0.006544643547385931, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 325692446.0, "reward": 0.8094171285629272, "reward_std": 0.03800683468580246, "rewards/progression_diversity/mean": -0.0006720458623021841, "rewards/progression_diversity/std": 0.014617693610489368, "rewards/symbolic_reward_accuracy/mean": 0.8671875, "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, "rewards/symbolic_reward_partial_score/mean": 0.96435546875, "rewards/symbolic_reward_partial_score/std": 0.10590612888336182, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.055977702140808, "sampling/importance_sampling_ratio/min": 0.0003501154133118689, "sampling/sampling_logp_difference/max": 7.957247734069824, "sampling/sampling_logp_difference/mean": 0.11574487388134003, "step": 1725 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.22993683815002441, "epoch": 4.542105263157895, "grad_norm": 0.0012803367571905255, "learning_rate": 1e-06, "loss": -0.0005, "step": 1726 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.22964587807655334, "epoch": 4.544736842105263, "grad_norm": 0.0012463160092011094, "learning_rate": 1e-06, "loss": -0.0006, "step": 1727 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.22539836168289185, "epoch": 4.5473684210526315, "grad_norm": 0.006871597841382027, "learning_rate": 1e-06, "loss": 0.0304, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 280.208984375, "completions/mean_terminated_length": 280.208984375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.23477299511432648, "epoch": 4.55, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0005855901399627328, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 326242889.0, "reward": 0.8384764790534973, "reward_std": 0.0075668939389288425, "rewards/progression_diversity/mean": -1.1385917787265498e-05, "rewards/progression_diversity/std": 0.0002576339466031641, "rewards/symbolic_reward_accuracy/mean": 0.91015625, "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, "rewards/symbolic_reward_partial_score/mean": 0.974609375, "rewards/symbolic_reward_partial_score/std": 0.0853847861289978, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0593574047088623, "sampling/importance_sampling_ratio/min": 0.0006416025571525097, "sampling/sampling_logp_difference/max": 7.351541519165039, "sampling/sampling_logp_difference/mean": 0.11885949969291687, "step": 1729 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2322026565670967, "epoch": 4.552631578947368, "grad_norm": 0.0004462753713596612, "learning_rate": 1e-06, "loss": 0.0002, "step": 1730 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.229575015604496, "epoch": 4.5552631578947365, "grad_norm": 0.0003073792904615402, "learning_rate": 1e-06, "loss": 0.0003, "step": 1731 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.23423191159963608, "epoch": 4.557894736842105, "grad_norm": 0.003369005862623453, "learning_rate": 1e-06, "loss": -0.0007, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 277.25390625, "completions/mean_terminated_length": 277.25390625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "entropy": 0.2312970906496048, "epoch": 4.560526315789474, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00039534809184260666, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 326781131.0, "reward": 0.85791015625, "reward_std": 0.005859375931322575, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.935546875, "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, "rewards/symbolic_reward_partial_score/mean": 0.9886067509651184, "rewards/symbolic_reward_partial_score/std": 0.04581141844391823, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0576618909835815, "sampling/importance_sampling_ratio/min": 4.253246879670769e-05, "sampling/sampling_logp_difference/max": 10.065242767333984, "sampling/sampling_logp_difference/mean": 0.11823997646570206, "step": 1733 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.23137207329273224, "epoch": 4.563157894736842, "grad_norm": 0.0002846321149263531, "learning_rate": 1e-06, "loss": -0.0001, "step": 1734 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2298935204744339, "epoch": 4.565789473684211, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0001, "step": 1735 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.22877340018749237, "epoch": 4.568421052631579, "grad_norm": 0.0003122953639831394, "learning_rate": 1e-06, "loss": 0.0004, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 283.275390625, "completions/mean_terminated_length": 283.275390625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.22834929078817368, "epoch": 4.571052631578947, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0007801002357155085, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 327328632.0, "reward": 0.8971680402755737, "reward_std": 0.01132812537252903, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.99609375, "rewards/symbolic_reward_accuracy/std": 0.06243881583213806, "rewards/symbolic_reward_partial_score/mean": 0.9983724355697632, "rewards/symbolic_reward_partial_score/std": 0.026533395051956177, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0584046840667725, "sampling/importance_sampling_ratio/min": 0.003217845456674695, "sampling/sampling_logp_difference/max": 5.739043235778809, "sampling/sampling_logp_difference/mean": 0.11800207197666168, "step": 1737 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2275266945362091, "epoch": 4.573684210526316, "grad_norm": 0.0003757645608857274, "learning_rate": 1e-06, "loss": 0.0004, "step": 1738 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23003485798835754, "epoch": 4.576315789473684, "grad_norm": 0.0006597706233151257, "learning_rate": 1e-06, "loss": -0.0003, "step": 1739 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.22901742905378342, "epoch": 4.578947368421053, "grad_norm": 0.00029348907992243767, "learning_rate": 1e-06, "loss": 0.0003, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 314.96875, "completions/mean_terminated_length": 283.5224914550781, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.2328527346253395, "epoch": 4.581578947368421, "frac_reward_zero_std": 0.75, "grad_norm": 0.006896194536238909, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 327900200.0, "reward": 0.821234941482544, "reward_std": 0.06384304165840149, "rewards/progression_diversity/mean": -0.0005336772883310914, "rewards/progression_diversity/std": 0.012075738981366158, "rewards/symbolic_reward_accuracy/mean": 0.88671875, "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, "rewards/symbolic_reward_partial_score/mean": 0.9646809697151184, "rewards/symbolic_reward_partial_score/std": 0.10966981947422028, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0578877925872803, "sampling/importance_sampling_ratio/min": 5.2641491492977366e-05, "sampling/sampling_logp_difference/max": 9.852005958557129, "sampling/sampling_logp_difference/mean": 0.11672507226467133, "step": 1741 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.2317691296339035, "epoch": 4.58421052631579, "grad_norm": 0.0015694440808147192, "learning_rate": 1e-06, "loss": 0.0009, "step": 1742 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.23245307803153992, "epoch": 4.586842105263158, "grad_norm": 0.0021039163693785667, "learning_rate": 1e-06, "loss": -0.0004, "step": 1743 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.22942954301834106, "epoch": 4.589473684210526, "grad_norm": 0.010603874921798706, "learning_rate": 1e-06, "loss": -0.0294, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 285.0625, "completions/mean_terminated_length": 285.0625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "entropy": 0.23405387252569199, "epoch": 4.592105263157895, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0022874982096254826, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 328485832.0, "reward": 0.8634766340255737, "reward_std": 0.02220808155834675, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.94921875, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.9798176884651184, "rewards/symbolic_reward_partial_score/std": 0.09398413449525833, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0588750839233398, "sampling/importance_sampling_ratio/min": 0.00381456664763391, "sampling/sampling_logp_difference/max": 5.568928241729736, "sampling/sampling_logp_difference/mean": 0.12032032012939453, "step": 1745 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2334182858467102, "epoch": 4.594736842105263, "grad_norm": 0.0004996811621822417, "learning_rate": 1e-06, "loss": 0.0005, "step": 1746 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23181022703647614, "epoch": 4.597368421052631, "grad_norm": 0.0012804733123630285, "learning_rate": 1e-06, "loss": -0.0008, "step": 1747 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.22890260815620422, "epoch": 4.6, "grad_norm": 0.0072340769693255424, "learning_rate": 1e-06, "loss": -0.0001, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 287.724609375, "completions/mean_terminated_length": 287.724609375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "entropy": 0.23200425505638123, "epoch": 4.602631578947369, "frac_reward_zero_std": 0.90625, "grad_norm": 0.001083358540199697, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 329068155.0, "reward": 0.8512207269668579, "reward_std": 0.016063103452324867, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.9296875, "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, "rewards/symbolic_reward_partial_score/mean": 0.97802734375, "rewards/symbolic_reward_partial_score/std": 0.08205181360244751, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0593247413635254, "sampling/importance_sampling_ratio/min": 0.0002779820642899722, "sampling/sampling_logp_difference/max": 8.18795394897461, "sampling/sampling_logp_difference/mean": 0.11916124075651169, "step": 1749 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23036928474903107, "epoch": 4.605263157894737, "grad_norm": 0.0007268539047800004, "learning_rate": 1e-06, "loss": -0.0001, "step": 1750 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.22929440438747406, "epoch": 4.6078947368421055, "grad_norm": 0.004047263413667679, "learning_rate": 1e-06, "loss": -0.0001, "step": 1751 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.22957557439804077, "epoch": 4.610526315789474, "grad_norm": 0.007964338175952435, "learning_rate": 1e-06, "loss": 0.0005, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 288.81640625, "completions/mean_terminated_length": 288.81640625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.23655959218740463, "epoch": 4.613157894736842, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004663840867578983, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 329621021.0, "reward": 0.8336426019668579, "reward_std": 0.026225607842206955, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.908203125, "rewards/symbolic_reward_accuracy/std": 0.289021372795105, "rewards/symbolic_reward_partial_score/mean": 0.96240234375, "rewards/symbolic_reward_partial_score/std": 0.12385479360818863, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0611376762390137, "sampling/importance_sampling_ratio/min": 1.428700124961324e-05, "sampling/sampling_logp_difference/max": 11.156160354614258, "sampling/sampling_logp_difference/mean": 0.12244156748056412, "step": 1753 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2377874031662941, "epoch": 4.61578947368421, "grad_norm": 0.0013511740835383534, "learning_rate": 1e-06, "loss": -0.0008, "step": 1754 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23934999853372574, "epoch": 4.618421052631579, "grad_norm": 0.0017129138577729464, "learning_rate": 1e-06, "loss": 0.0007, "step": 1755 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23353976756334305, "epoch": 4.621052631578947, "grad_norm": 0.0062218764796853065, "learning_rate": 1e-06, "loss": 0.0004, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 287.197265625, "completions/mean_terminated_length": 287.197265625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.23446016758680344, "epoch": 4.623684210526315, "frac_reward_zero_std": 0.78125, "grad_norm": 0.00431384751573205, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 330159682.0, "reward": 0.8695312142372131, "reward_std": 0.04663246124982834, "rewards/progression_diversity/mean": -8.157388037943747e-06, "rewards/progression_diversity/std": 0.00018458062550053, "rewards/symbolic_reward_accuracy/mean": 0.955078125, "rewards/symbolic_reward_accuracy/std": 0.20733514428138733, "rewards/symbolic_reward_partial_score/mean": 0.98828125, "rewards/symbolic_reward_partial_score/std": 0.06211148202419281, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0600439310073853, "sampling/importance_sampling_ratio/min": 0.00023238833819050342, "sampling/sampling_logp_difference/max": 8.367100715637207, "sampling/sampling_logp_difference/mean": 0.12167903035879135, "step": 1757 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.2346833571791649, "epoch": 4.626315789473685, "grad_norm": 0.0014358048792928457, "learning_rate": 1e-06, "loss": -0.0002, "step": 1758 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.109375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1328125, "entropy": 0.23761341720819473, "epoch": 4.628947368421053, "grad_norm": 0.0057918294332921505, "learning_rate": 1e-06, "loss": 0.0002, "step": 1759 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23876164853572845, "epoch": 4.631578947368421, "grad_norm": 0.0010682097636163235, "learning_rate": 1e-06, "loss": -0.0004, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 288.267578125, "completions/mean_terminated_length": 288.267578125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.236293226480484, "epoch": 4.63421052631579, "frac_reward_zero_std": 0.875, "grad_norm": 0.006198339629918337, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 330706251.0, "reward": 0.8838867545127869, "reward_std": 0.029681198298931122, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.978515625, "rewards/symbolic_reward_accuracy/std": 0.14513419568538666, "rewards/symbolic_reward_partial_score/mean": 0.9892578125, "rewards/symbolic_reward_partial_score/std": 0.07256709784269333, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060239315032959, "sampling/importance_sampling_ratio/min": 0.003650497877970338, "sampling/sampling_logp_difference/max": 5.612891674041748, "sampling/sampling_logp_difference/mean": 0.12036669254302979, "step": 1761 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23460889607667923, "epoch": 4.636842105263158, "grad_norm": 0.0026386433746665716, "learning_rate": 1e-06, "loss": -0.0003, "step": 1762 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.2347528636455536, "epoch": 4.639473684210526, "grad_norm": 0.0008613457321189344, "learning_rate": 1e-06, "loss": 0.0004, "step": 1763 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23094835877418518, "epoch": 4.6421052631578945, "grad_norm": 0.002696852432563901, "learning_rate": 1e-06, "loss": 0.0002, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 285.228515625, "completions/mean_terminated_length": 285.228515625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.23169712722301483, "epoch": 4.644736842105263, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0015485257608816028, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 331248416.0, "reward": 0.8603997230529785, "reward_std": 0.025298018008470535, "rewards/progression_diversity/mean": -7.630509207956493e-05, "rewards/progression_diversity/std": 0.0012197623727843165, "rewards/symbolic_reward_accuracy/mean": 0.939453125, "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, "rewards/symbolic_reward_partial_score/mean": 0.9890950322151184, "rewards/symbolic_reward_partial_score/std": 0.04548434168100357, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0597522258758545, "sampling/importance_sampling_ratio/min": 0.0031475864816457033, "sampling/sampling_logp_difference/max": 5.761119365692139, "sampling/sampling_logp_difference/mean": 0.12072933465242386, "step": 1765 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.23586693406105042, "epoch": 4.647368421052631, "grad_norm": 0.0018776926444843411, "learning_rate": 1e-06, "loss": 0.0006, "step": 1766 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.23478037863969803, "epoch": 4.65, "grad_norm": 0.0038558384403586388, "learning_rate": 1e-06, "loss": 0.0001, "step": 1767 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.2335403859615326, "epoch": 4.652631578947369, "grad_norm": 0.000620207458268851, "learning_rate": 1e-06, "loss": -0.0005, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 281.16015625, "completions/mean_terminated_length": 281.16015625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "entropy": 0.232573501765728, "epoch": 4.655263157894737, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0054251509718596935, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 331805362.0, "reward": 0.8727539777755737, "reward_std": 0.01990697905421257, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.962890625, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.9833984375, "rewards/symbolic_reward_partial_score/std": 0.0878334566950798, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0596094131469727, "sampling/importance_sampling_ratio/min": 2.395567935309373e-05, "sampling/sampling_logp_difference/max": 10.639305114746094, "sampling/sampling_logp_difference/mean": 0.1198565810918808, "step": 1769 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.23547282069921494, "epoch": 4.657894736842105, "grad_norm": 0.0010475910967215896, "learning_rate": 1e-06, "loss": -0.0003, "step": 1770 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23682619631290436, "epoch": 4.660526315789474, "grad_norm": 0.0034293478820472956, "learning_rate": 1e-06, "loss": -0.0004, "step": 1771 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23524679243564606, "epoch": 4.663157894736842, "grad_norm": 0.0031075715087354183, "learning_rate": 1e-06, "loss": 0.0006, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 319.95703125, "completions/mean_terminated_length": 288.5205383300781, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "entropy": 0.24038264155387878, "epoch": 4.66578947368421, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0012618119362741709, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 332397852.0, "reward": 0.8819647431373596, "reward_std": 0.015408697538077831, "rewards/progression_diversity/mean": -0.0017699984600767493, "rewards/progression_diversity/std": 0.037862591445446014, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9913736581802368, "rewards/symbolic_reward_partial_score/std": 0.06337195634841919, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0580030679702759, "sampling/importance_sampling_ratio/min": 0.002201242372393608, "sampling/sampling_logp_difference/max": 6.1187334060668945, "sampling/sampling_logp_difference/mean": 0.12074305862188339, "step": 1773 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23519500344991684, "epoch": 4.668421052631579, "grad_norm": 0.001645031850785017, "learning_rate": 1e-06, "loss": -0.0008, "step": 1774 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23526707291603088, "epoch": 4.671052631578947, "grad_norm": 0.002648879075422883, "learning_rate": 1e-06, "loss": -0.0001, "step": 1775 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23147708177566528, "epoch": 4.673684210526316, "grad_norm": 0.0012472477974370122, "learning_rate": 1e-06, "loss": 0.0112, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 285.880859375, "completions/mean_terminated_length": 285.880859375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.23570366948843002, "epoch": 4.676315789473684, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0007892610155977309, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 332941855.0, "reward": 0.8750487565994263, "reward_std": 0.01221198309212923, "rewards/progression_diversity/mean": -1.265850369236432e-05, "rewards/progression_diversity/std": 0.0002864292182493955, "rewards/symbolic_reward_accuracy/mean": 0.962890625, "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, "rewards/symbolic_reward_partial_score/mean": 0.9910481572151184, "rewards/symbolic_reward_partial_score/std": 0.045909516513347626, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059958577156067, "sampling/importance_sampling_ratio/min": 3.310117608634755e-05, "sampling/sampling_logp_difference/max": 10.31594181060791, "sampling/sampling_logp_difference/mean": 0.12255353480577469, "step": 1777 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23493484407663345, "epoch": 4.678947368421053, "grad_norm": 0.0008538050460629165, "learning_rate": 1e-06, "loss": -0.0004, "step": 1778 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23577520996332169, "epoch": 4.681578947368421, "grad_norm": 0.004247025586664677, "learning_rate": 1e-06, "loss": 0.0001, "step": 1779 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2345782071352005, "epoch": 4.684210526315789, "grad_norm": 0.005044107791036367, "learning_rate": 1e-06, "loss": 0.0007, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 286.583984375, "completions/mean_terminated_length": 286.583984375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "entropy": 0.23718595504760742, "epoch": 4.686842105263158, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0074276975356042385, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 333467690.0, "reward": 0.8631348013877869, "reward_std": 0.034078460186719894, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.947265625, "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, "rewards/symbolic_reward_partial_score/mean": 0.9825845956802368, "rewards/symbolic_reward_partial_score/std": 0.07894845306873322, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0592341423034668, "sampling/importance_sampling_ratio/min": 4.83025869471021e-05, "sampling/sampling_logp_difference/max": 9.93802547454834, "sampling/sampling_logp_difference/mean": 0.12274422496557236, "step": 1781 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23618671298027039, "epoch": 4.689473684210526, "grad_norm": 0.00105291698127985, "learning_rate": 1e-06, "loss": -0.0009, "step": 1782 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23698270320892334, "epoch": 4.692105263157895, "grad_norm": 0.005746976938098669, "learning_rate": 1e-06, "loss": 0.0008, "step": 1783 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2383096069097519, "epoch": 4.6947368421052635, "grad_norm": 0.0012328887823969126, "learning_rate": 1e-06, "loss": -0.0006, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 285.83984375, "completions/mean_terminated_length": 285.83984375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.23322966694831848, "epoch": 4.697368421052632, "frac_reward_zero_std": 0.875, "grad_norm": 0.0064533608965575695, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 334002328.0, "reward": 0.8294922113418579, "reward_std": 0.03646354004740715, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.900390625, "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, "rewards/symbolic_reward_partial_score/mean": 0.9641926884651184, "rewards/symbolic_reward_partial_score/std": 0.11747395247220993, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0586230754852295, "sampling/importance_sampling_ratio/min": 0.000552015146240592, "sampling/sampling_logp_difference/max": 7.501935005187988, "sampling/sampling_logp_difference/mean": 0.12089669704437256, "step": 1785 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2360163927078247, "epoch": 4.7, "grad_norm": 0.0023228242062032223, "learning_rate": 1e-06, "loss": 0.0003, "step": 1786 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23313098400831223, "epoch": 4.7026315789473685, "grad_norm": 0.0005817331839352846, "learning_rate": 1e-06, "loss": -0.0, "step": 1787 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.23333580791950226, "epoch": 4.705263157894737, "grad_norm": 0.000786386604886502, "learning_rate": 1e-06, "loss": -0.001, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 281.697265625, "completions/mean_terminated_length": 281.697265625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.23130569607019424, "epoch": 4.707894736842105, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0013153174659237266, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 334544189.0, "reward": 0.8330566883087158, "reward_std": 0.030620714649558067, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.96826171875, "rewards/symbolic_reward_partial_score/std": 0.10676281899213791, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0613431930541992, "sampling/importance_sampling_ratio/min": 0.002472738502547145, "sampling/sampling_logp_difference/max": 6.002429008483887, "sampling/sampling_logp_difference/mean": 0.12073177099227905, "step": 1789 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.23753570765256882, "epoch": 4.7105263157894735, "grad_norm": 0.004040809348225594, "learning_rate": 1e-06, "loss": -0.0003, "step": 1790 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.24012085795402527, "epoch": 4.713157894736842, "grad_norm": 0.0009908534120768309, "learning_rate": 1e-06, "loss": 0.0008, "step": 1791 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2364014834165573, "epoch": 4.715789473684211, "grad_norm": 0.000720858748536557, "learning_rate": 1e-06, "loss": -0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 349.853515625, "completions/mean_terminated_length": 286.9745178222656, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "entropy": 0.2367512583732605, "epoch": 4.718421052631579, "frac_reward_zero_std": 0.90625, "grad_norm": 0.004800880327820778, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 335109778.0, "reward": 0.874405562877655, "reward_std": 0.013915155082941055, "rewards/progression_diversity/mean": -0.0008550827042199671, "rewards/progression_diversity/std": 0.01934831216931343, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.986328125, "rewards/symbolic_reward_partial_score/std": 0.08010873943567276, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0590322017669678, "sampling/importance_sampling_ratio/min": 0.004423519130796194, "sampling/sampling_logp_difference/max": 5.4208197593688965, "sampling/sampling_logp_difference/mean": 0.11889566481113434, "step": 1793 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23232899606227875, "epoch": 4.721052631578948, "grad_norm": 0.0069125681184232235, "learning_rate": 1e-06, "loss": 0.0237, "step": 1794 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23827090859413147, "epoch": 4.723684210526316, "grad_norm": 0.003237023251131177, "learning_rate": 1e-06, "loss": 0.0101, "step": 1795 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23686149716377258, "epoch": 4.726315789473684, "grad_norm": 0.0009356890805065632, "learning_rate": 1e-06, "loss": -0.0001, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 313.6484375, "completions/mean_terminated_length": 282.1996154785156, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.2309466004371643, "epoch": 4.728947368421053, "frac_reward_zero_std": 0.875, "grad_norm": 0.0016089669661596417, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 335674174.0, "reward": 0.8695645332336426, "reward_std": 0.020846091210842133, "rewards/progression_diversity/mean": -0.0015609528636559844, "rewards/progression_diversity/std": 0.03335866704583168, "rewards/symbolic_reward_accuracy/mean": 0.95703125, "rewards/symbolic_reward_accuracy/std": 0.2029850035905838, "rewards/symbolic_reward_partial_score/mean": 0.9851887822151184, "rewards/symbolic_reward_partial_score/std": 0.07982046157121658, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.057593584060669, "sampling/importance_sampling_ratio/min": 0.0018050133949145675, "sampling/sampling_logp_difference/max": 6.317187309265137, "sampling/sampling_logp_difference/mean": 0.11618360877037048, "step": 1797 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23156511783599854, "epoch": 4.731578947368421, "grad_norm": 0.0030565187335014343, "learning_rate": 1e-06, "loss": -0.0, "step": 1798 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2370528131723404, "epoch": 4.734210526315789, "grad_norm": 0.005625669378787279, "learning_rate": 1e-06, "loss": -0.0009, "step": 1799 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.22954415529966354, "epoch": 4.7368421052631575, "grad_norm": 0.002420470118522644, "learning_rate": 1e-06, "loss": 0.0118, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 284.154296875, "completions/mean_terminated_length": 284.154296875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "entropy": 0.23575779795646667, "epoch": 4.739473684210527, "frac_reward_zero_std": 0.8125, "grad_norm": 0.003084713127464056, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 336225133.0, "reward": 0.8538080453872681, "reward_std": 0.01827925443649292, "rewards/progression_diversity/mean": -5.8929272199748084e-05, "rewards/progression_diversity/std": 0.001333417254500091, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.9827474355697632, "rewards/symbolic_reward_partial_score/std": 0.07354949414730072, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0592098236083984, "sampling/importance_sampling_ratio/min": 0.00014154307427816093, "sampling/sampling_logp_difference/max": 8.862906455993652, "sampling/sampling_logp_difference/mean": 0.12110301852226257, "step": 1801 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.23429590463638306, "epoch": 4.742105263157895, "grad_norm": 0.0009401784045621753, "learning_rate": 1e-06, "loss": -0.0001, "step": 1802 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23786381632089615, "epoch": 4.744736842105263, "grad_norm": 0.005505091045051813, "learning_rate": 1e-06, "loss": 0.0001, "step": 1803 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23975226283073425, "epoch": 4.747368421052632, "grad_norm": 0.0008587277843616903, "learning_rate": 1e-06, "loss": 0.0006, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 284.486328125, "completions/mean_terminated_length": 284.486328125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "entropy": 0.2338343784213066, "epoch": 4.75, "frac_reward_zero_std": 0.875, "grad_norm": 0.001873121364042163, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 336790118.0, "reward": 0.867529034614563, "reward_std": 0.027098482474684715, "rewards/progression_diversity/mean": -3.302685945527628e-05, "rewards/progression_diversity/std": 0.0007473125006072223, "rewards/symbolic_reward_accuracy/mean": 0.953125, "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, "rewards/symbolic_reward_partial_score/mean": 0.9855143427848816, "rewards/symbolic_reward_partial_score/std": 0.06589668989181519, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.060340404510498, "sampling/importance_sampling_ratio/min": 3.4130298445234075e-05, "sampling/sampling_logp_difference/max": 10.285325050354004, "sampling/sampling_logp_difference/mean": 0.12118184566497803, "step": 1805 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23535771667957306, "epoch": 4.752631578947368, "grad_norm": 0.003209069836884737, "learning_rate": 1e-06, "loss": -0.0002, "step": 1806 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23693060129880905, "epoch": 4.755263157894737, "grad_norm": 0.006161042023450136, "learning_rate": 1e-06, "loss": -0.0004, "step": 1807 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23479554057121277, "epoch": 4.757894736842105, "grad_norm": 0.0019247831078246236, "learning_rate": 1e-06, "loss": -0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 285.146484375, "completions/mean_terminated_length": 285.146484375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.23206809908151627, "epoch": 4.760526315789473, "frac_reward_zero_std": 0.96875, "grad_norm": 0.001541185425594449, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 337325905.0, "reward": 0.8272461295127869, "reward_std": 0.01054687611758709, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.9684244394302368, "rewards/symbolic_reward_partial_score/std": 0.10192830860614777, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0599029064178467, "sampling/importance_sampling_ratio/min": 0.0008538118563592434, "sampling/sampling_logp_difference/max": 7.065799713134766, "sampling/sampling_logp_difference/mean": 0.12050032615661621, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2352423518896103, "epoch": 4.7631578947368425, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0004, "step": 1810 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2324458658695221, "epoch": 4.765789473684211, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": 0.0001, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.23431549966335297, "epoch": 4.768421052631579, "grad_norm": 0.0009463911992497742, "learning_rate": 1e-06, "loss": -0.0002, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 281.193359375, "completions/mean_terminated_length": 281.193359375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "entropy": 0.23839272558689117, "epoch": 4.771052631578947, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0013199823442846537, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 337859668.0, "reward": 0.8254392743110657, "reward_std": 0.026775745674967766, "rewards/progression_diversity/mean": -1.901627729239408e-05, "rewards/progression_diversity/std": 0.00043028921936638653, "rewards/symbolic_reward_accuracy/mean": 0.892578125, "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, "rewards/symbolic_reward_partial_score/mean": 0.96630859375, "rewards/symbolic_reward_partial_score/std": 0.10718066245317459, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0602418184280396, "sampling/importance_sampling_ratio/min": 0.0003893849498126656, "sampling/sampling_logp_difference/max": 7.850942134857178, "sampling/sampling_logp_difference/mean": 0.12159500271081924, "step": 1813 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2346992939710617, "epoch": 4.773684210526316, "grad_norm": 0.0012670259457081556, "learning_rate": 1e-06, "loss": -0.0007, "step": 1814 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.23430411517620087, "epoch": 4.776315789473684, "grad_norm": 0.001033074571751058, "learning_rate": 1e-06, "loss": -0.0, "step": 1815 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2374269664287567, "epoch": 4.778947368421052, "grad_norm": 0.0033665422815829515, "learning_rate": 1e-06, "loss": 0.001, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 289.28515625, "completions/mean_terminated_length": 289.28515625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2364349216222763, "epoch": 4.781578947368421, "frac_reward_zero_std": 0.9375, "grad_norm": 0.0023707703221589327, "learning_rate": 1e-06, "loss": -0.0001, "num_tokens": 338414278.0, "reward": 0.847900390625, "reward_std": 0.016666706651449203, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.921875, "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, "rewards/symbolic_reward_partial_score/mean": 0.9825845956802368, "rewards/symbolic_reward_partial_score/std": 0.06219496950507164, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0596914291381836, "sampling/importance_sampling_ratio/min": 0.003912740852683783, "sampling/sampling_logp_difference/max": 5.543517112731934, "sampling/sampling_logp_difference/mean": 0.12083780765533447, "step": 1817 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23182794451713562, "epoch": 4.784210526315789, "grad_norm": 0.006022021174430847, "learning_rate": 1e-06, "loss": -0.0002, "step": 1818 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.23250708729028702, "epoch": 4.786842105263158, "grad_norm": 0.0012751823524013162, "learning_rate": 1e-06, "loss": -0.0001, "step": 1819 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23640374094247818, "epoch": 4.7894736842105265, "grad_norm": 0.0018326534191146493, "learning_rate": 1e-06, "loss": 0.0002, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 288.1015625, "completions/mean_terminated_length": 288.1015625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "entropy": 0.2266254797577858, "epoch": 4.792105263157895, "frac_reward_zero_std": 0.96875, "grad_norm": 0.00011159066343680024, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 338966074.0, "reward": 0.899999737739563, "reward_std": 1.0485451866770745e-06, "rewards/progression_diversity/mean": -2.6222385713481344e-05, "rewards/progression_diversity/std": 0.0005933448555879295, "rewards/symbolic_reward_accuracy/mean": 1.0, "rewards/symbolic_reward_accuracy/std": 0.0, "rewards/symbolic_reward_partial_score/mean": 1.0, "rewards/symbolic_reward_partial_score/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0588757991790771, "sampling/importance_sampling_ratio/min": 1.982930598387611e-06, "sampling/sampling_logp_difference/max": 13.130934715270996, "sampling/sampling_logp_difference/mean": 0.11928833276033401, "step": 1821 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.23060919344425201, "epoch": 4.794736842105263, "grad_norm": 7.569075387436897e-05, "learning_rate": 1e-06, "loss": -0.0, "step": 1822 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.22907361388206482, "epoch": 4.7973684210526315, "grad_norm": 5.835884076077491e-05, "learning_rate": 1e-06, "loss": 0.0001, "step": 1823 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23327982425689697, "epoch": 4.8, "grad_norm": 0.0, "learning_rate": 1e-06, "loss": -0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 289.376953125, "completions/mean_terminated_length": 289.376953125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "entropy": 0.23341242969036102, "epoch": 4.802631578947368, "frac_reward_zero_std": 0.875, "grad_norm": 0.003994180355221033, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 339509019.0, "reward": 0.797314465045929, "reward_std": 0.03295782953500748, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.8515625, "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, "rewards/symbolic_reward_partial_score/mean": 0.95458984375, "rewards/symbolic_reward_partial_score/std": 0.12550002336502075, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0598238706588745, "sampling/importance_sampling_ratio/min": 0.0002354941243538633, "sampling/sampling_logp_difference/max": 8.353824615478516, "sampling/sampling_logp_difference/mean": 0.12100344896316528, "step": 1825 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23315101116895676, "epoch": 4.8052631578947365, "grad_norm": 0.003105239477008581, "learning_rate": 1e-06, "loss": -0.0002, "step": 1826 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2331264168024063, "epoch": 4.807894736842105, "grad_norm": 0.004227034747600555, "learning_rate": 1e-06, "loss": 0.0007, "step": 1827 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23640157282352448, "epoch": 4.810526315789474, "grad_norm": 0.006377407815307379, "learning_rate": 1e-06, "loss": -0.0006, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 291.056640625, "completions/mean_terminated_length": 291.056640625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.23511869460344315, "epoch": 4.813157894736842, "frac_reward_zero_std": 0.90625, "grad_norm": 0.00462336977943778, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 340066840.0, "reward": 0.8834961652755737, "reward_std": 0.022172415629029274, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.974609375, "rewards/symbolic_reward_accuracy/std": 0.15746226906776428, "rewards/symbolic_reward_partial_score/mean": 0.9957681894302368, "rewards/symbolic_reward_partial_score/std": 0.02624371461570263, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0585315227508545, "sampling/importance_sampling_ratio/min": 0.000580458901822567, "sampling/sampling_logp_difference/max": 7.451691627502441, "sampling/sampling_logp_difference/mean": 0.11921732127666473, "step": 1829 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23502135276794434, "epoch": 4.815789473684211, "grad_norm": 0.0011459586676210165, "learning_rate": 1e-06, "loss": 0.0007, "step": 1830 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.2287670001387596, "epoch": 4.818421052631579, "grad_norm": 0.0017157213296741247, "learning_rate": 1e-06, "loss": -0.0001, "step": 1831 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2291087955236435, "epoch": 4.821052631578947, "grad_norm": 0.0019412727560847998, "learning_rate": 1e-06, "loss": -0.0005, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 319.869140625, "completions/mean_terminated_length": 288.4324951171875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "entropy": 0.23015135526657104, "epoch": 4.823684210526316, "frac_reward_zero_std": 0.84375, "grad_norm": 0.002705089980736375, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 340650101.0, "reward": 0.8444219827651978, "reward_std": 0.034328803420066833, "rewards/progression_diversity/mean": -0.0011606995249167085, "rewards/progression_diversity/std": 0.026263633742928505, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9638671875, "rewards/symbolic_reward_partial_score/std": 0.13422636687755585, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0576014518737793, "sampling/importance_sampling_ratio/min": 1.53549427551216e-22, "sampling/sampling_logp_difference/max": 50.22801971435547, "sampling/sampling_logp_difference/mean": 0.11635151505470276, "step": 1833 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.22804827243089676, "epoch": 4.826315789473684, "grad_norm": 0.004144140984863043, "learning_rate": 1e-06, "loss": 0.0115, "step": 1834 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.2338591292500496, "epoch": 4.828947368421053, "grad_norm": 0.001100580906495452, "learning_rate": 1e-06, "loss": 0.0001, "step": 1835 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23625478148460388, "epoch": 4.831578947368421, "grad_norm": 0.011743053793907166, "learning_rate": 1e-06, "loss": -0.0011, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 289.177734375, "completions/mean_terminated_length": 289.177734375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "entropy": 0.23688583821058273, "epoch": 4.83421052631579, "frac_reward_zero_std": 0.875, "grad_norm": 0.0028815693221986294, "learning_rate": 1e-06, "loss": -0.0006, "num_tokens": 341199984.0, "reward": 0.8502440452575684, "reward_std": 0.013955610804259777, "rewards/progression_diversity/mean": -7.151393219828606e-06, "rewards/progression_diversity/std": 0.00012617984612006694, "rewards/symbolic_reward_accuracy/mean": 0.931640625, "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, "rewards/symbolic_reward_partial_score/mean": 0.9708659052848816, "rewards/symbolic_reward_partial_score/std": 0.11147594451904297, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0599719285964966, "sampling/importance_sampling_ratio/min": 0.0025296704843640327, "sampling/sampling_logp_difference/max": 5.979666233062744, "sampling/sampling_logp_difference/mean": 0.12142408639192581, "step": 1837 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.24060922861099243, "epoch": 4.836842105263158, "grad_norm": 0.0021192163694649935, "learning_rate": 1e-06, "loss": -0.0005, "step": 1838 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.23550759255886078, "epoch": 4.839473684210526, "grad_norm": 0.00041981041431427, "learning_rate": 1e-06, "loss": 0.0007, "step": 1839 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.23710917681455612, "epoch": 4.842105263157895, "grad_norm": 0.0005093511426821351, "learning_rate": 1e-06, "loss": 0.0005, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 286.017578125, "completions/mean_terminated_length": 286.017578125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "entropy": 0.23979313671588898, "epoch": 4.844736842105263, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0012512730900198221, "learning_rate": 1e-06, "loss": -0.0005, "num_tokens": 341719033.0, "reward": 0.893261730670929, "reward_std": 0.01963597722351551, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.990234375, "rewards/symbolic_reward_accuracy/std": 0.09843364357948303, "rewards/symbolic_reward_partial_score/mean": 0.9970703125, "rewards/symbolic_reward_partial_score/std": 0.031142795458436012, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059399127960205, "sampling/importance_sampling_ratio/min": 6.281732930801809e-05, "sampling/sampling_logp_difference/max": 9.67527961730957, "sampling/sampling_logp_difference/mean": 0.12148109823465347, "step": 1841 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2339087724685669, "epoch": 4.847368421052631, "grad_norm": 0.0032734405249357224, "learning_rate": 1e-06, "loss": -0.0001, "step": 1842 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23710087686777115, "epoch": 4.85, "grad_norm": 0.004084085114300251, "learning_rate": 1e-06, "loss": -0.0001, "step": 1843 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23539859056472778, "epoch": 4.852631578947369, "grad_norm": 0.007630764041095972, "learning_rate": 1e-06, "loss": 0.0009, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 291.00390625, "completions/mean_terminated_length": 291.00390625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.23795322328805923, "epoch": 4.855263157894737, "frac_reward_zero_std": 0.78125, "grad_norm": 0.003123840782791376, "learning_rate": 1e-06, "loss": -0.0004, "num_tokens": 342274683.0, "reward": 0.8268060088157654, "reward_std": 0.04585159197449684, "rewards/progression_diversity/mean": -6.512173422379419e-05, "rewards/progression_diversity/std": 0.0014735364820808172, "rewards/symbolic_reward_accuracy/mean": 0.89453125, "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, "rewards/symbolic_reward_partial_score/mean": 0.9669595956802368, "rewards/symbolic_reward_partial_score/std": 0.1075100302696228, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0592467784881592, "sampling/importance_sampling_ratio/min": 0.0003768360475078225, "sampling/sampling_logp_difference/max": 7.883700370788574, "sampling/sampling_logp_difference/mean": 0.12160350382328033, "step": 1845 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0859375, "entropy": 0.2377663478255272, "epoch": 4.8578947368421055, "grad_norm": 0.010971352458000183, "learning_rate": 1e-06, "loss": 0.0002, "step": 1846 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.23767564445734024, "epoch": 4.860526315789474, "grad_norm": 0.001147806760855019, "learning_rate": 1e-06, "loss": -0.0016, "step": 1847 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.046875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.2376321703195572, "epoch": 4.863157894736842, "grad_norm": 0.006188137922435999, "learning_rate": 1e-06, "loss": 0.0018, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 360.208984375, "completions/mean_terminated_length": 297.37060546875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "entropy": 0.22861691564321518, "epoch": 4.86578947368421, "frac_reward_zero_std": 0.90625, "grad_norm": 0.005671604536473751, "learning_rate": 1e-06, "loss": 0.0146, "num_tokens": 342854054.0, "reward": 0.8605214953422546, "reward_std": 0.029463060200214386, "rewards/progression_diversity/mean": -0.002539029810577631, "rewards/progression_diversity/std": 0.04076690971851349, "rewards/symbolic_reward_accuracy/mean": 0.94921875, "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, "rewards/symbolic_reward_partial_score/mean": 0.9713541865348816, "rewards/symbolic_reward_partial_score/std": 0.13124485313892365, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0566385984420776, "sampling/importance_sampling_ratio/min": 0.0017628967761993408, "sampling/sampling_logp_difference/max": 6.340796947479248, "sampling/sampling_logp_difference/mean": 0.11643043160438538, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23994342982769012, "epoch": 4.868421052631579, "grad_norm": 0.0025020637549459934, "learning_rate": 1e-06, "loss": 0.0007, "step": 1850 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.2400030419230461, "epoch": 4.871052631578947, "grad_norm": 0.009745350107550621, "learning_rate": 1e-06, "loss": -0.0014, "step": 1851 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.24525520205497742, "epoch": 4.873684210526315, "grad_norm": 0.0018413093639537692, "learning_rate": 1e-06, "loss": 0.0005, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 292.111328125, "completions/mean_terminated_length": 292.111328125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "entropy": 0.2327280268073082, "epoch": 4.876315789473685, "frac_reward_zero_std": 0.8125, "grad_norm": 0.0019410037202760577, "learning_rate": 1e-06, "loss": 0.0012, "num_tokens": 343422623.0, "reward": 0.8218262195587158, "reward_std": 0.03689917176961899, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.890625, "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, "rewards/symbolic_reward_partial_score/mean": 0.9581705331802368, "rewards/symbolic_reward_partial_score/std": 0.13043980300426483, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.05967378616333, "sampling/importance_sampling_ratio/min": 4.0230930608231574e-05, "sampling/sampling_logp_difference/max": 10.120874404907227, "sampling/sampling_logp_difference/mean": 0.12192998826503754, "step": 1853 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.22967851161956787, "epoch": 4.878947368421053, "grad_norm": 0.002355337841436267, "learning_rate": 1e-06, "loss": -0.0001, "step": 1854 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.09375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.23766426742076874, "epoch": 4.881578947368421, "grad_norm": 0.001805786625482142, "learning_rate": 1e-06, "loss": -0.0005, "step": 1855 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.1015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.23744701594114304, "epoch": 4.88421052631579, "grad_norm": 0.0023394892923533916, "learning_rate": 1e-06, "loss": -0.0006, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 333.83203125, "completions/mean_terminated_length": 302.4226989746094, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "entropy": 0.2477203831076622, "epoch": 4.886842105263158, "frac_reward_zero_std": 0.875, "grad_norm": 0.0036808003205806017, "learning_rate": 1e-06, "loss": -0.0007, "num_tokens": 344016041.0, "reward": 0.8699126243591309, "reward_std": 0.03976935148239136, "rewards/progression_diversity/mean": -0.0009265051339752972, "rewards/progression_diversity/std": 0.02096441760659218, "rewards/symbolic_reward_accuracy/mean": 0.9609375, "rewards/symbolic_reward_accuracy/std": 0.1939331740140915, "rewards/symbolic_reward_partial_score/mean": 0.978515625, "rewards/symbolic_reward_partial_score/std": 0.1107143834233284, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0602765083312988, "sampling/importance_sampling_ratio/min": 9.47190681443999e-09, "sampling/sampling_logp_difference/max": 18.47493553161621, "sampling/sampling_logp_difference/mean": 0.12124471366405487, "step": 1857 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23858879506587982, "epoch": 4.889473684210526, "grad_norm": 0.011941269040107727, "learning_rate": 1e-06, "loss": 0.0219, "step": 1858 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.24422086030244827, "epoch": 4.8921052631578945, "grad_norm": 0.002855083905160427, "learning_rate": 1e-06, "loss": 0.0003, "step": 1859 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.2405536025762558, "epoch": 4.894736842105263, "grad_norm": 0.0015807250747457147, "learning_rate": 1e-06, "loss": -0.0015, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 292.560546875, "completions/mean_terminated_length": 292.560546875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "entropy": 0.23888926953077316, "epoch": 4.897368421052631, "frac_reward_zero_std": 0.875, "grad_norm": 0.001256766146980226, "learning_rate": 1e-06, "loss": -0.0003, "num_tokens": 344533768.0, "reward": 0.8810058832168579, "reward_std": 0.020134469494223595, "rewards/progression_diversity/mean": -2.70133432422881e-06, "rewards/progression_diversity/std": 6.112422124715522e-05, "rewards/symbolic_reward_accuracy/mean": 0.97265625, "rewards/symbolic_reward_accuracy/std": 0.16324250400066376, "rewards/symbolic_reward_partial_score/mean": 0.9913737177848816, "rewards/symbolic_reward_partial_score/std": 0.052070844918489456, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0612703561782837, "sampling/importance_sampling_ratio/min": 1.173806595033966e-05, "sampling/sampling_logp_difference/max": 11.352673530578613, "sampling/sampling_logp_difference/mean": 0.12240561842918396, "step": 1861 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23871064186096191, "epoch": 4.9, "grad_norm": 0.0026184869930148125, "learning_rate": 1e-06, "loss": -0.0007, "step": 1862 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.23794341087341309, "epoch": 4.902631578947369, "grad_norm": 0.00047851918498054147, "learning_rate": 1e-06, "loss": 0.0007, "step": 1863 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23701441287994385, "epoch": 4.905263157894737, "grad_norm": 0.0012964674970135093, "learning_rate": 1e-06, "loss": 0.0002, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 295.33984375, "completions/mean_terminated_length": 295.33984375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "entropy": 0.2406173199415207, "epoch": 4.907894736842105, "frac_reward_zero_std": 0.90625, "grad_norm": 0.002589939162135124, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 345085782.0, "reward": 0.8155273795127869, "reward_std": 0.022172417491674423, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.880859375, "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, "rewards/symbolic_reward_partial_score/mean": 0.9567056894302368, "rewards/symbolic_reward_partial_score/std": 0.1282230019569397, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.06076979637146, "sampling/importance_sampling_ratio/min": 1.2657117167691467e-06, "sampling/sampling_logp_difference/max": 13.579875946044922, "sampling/sampling_logp_difference/mean": 0.12502209842205048, "step": 1865 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.24070727825164795, "epoch": 4.910526315789474, "grad_norm": 0.0006123611237853765, "learning_rate": 1e-06, "loss": -0.0009, "step": 1866 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23713107407093048, "epoch": 4.913157894736842, "grad_norm": 0.0053363642655313015, "learning_rate": 1e-06, "loss": 0.0008, "step": 1867 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.03125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23777218908071518, "epoch": 4.91578947368421, "grad_norm": 0.0004397969751153141, "learning_rate": 1e-06, "loss": 0.0001, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 301.1484375, "completions/mean_terminated_length": 301.1484375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "entropy": 0.24252811819314957, "epoch": 4.918421052631579, "frac_reward_zero_std": 0.78125, "grad_norm": 0.005357232876121998, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 345673154.0, "reward": 0.8431152701377869, "reward_std": 0.05576281249523163, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.91796875, "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, "rewards/symbolic_reward_partial_score/mean": 0.9744465947151184, "rewards/symbolic_reward_partial_score/std": 0.09273875504732132, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0615004301071167, "sampling/importance_sampling_ratio/min": 4.7737885324750096e-05, "sampling/sampling_logp_difference/max": 9.949785232543945, "sampling/sampling_logp_difference/mean": 0.12491629272699356, "step": 1869 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0703125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.125, "entropy": 0.2446291595697403, "epoch": 4.921052631578947, "grad_norm": 0.0016193005722016096, "learning_rate": 1e-06, "loss": 0.0007, "step": 1870 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.24189703166484833, "epoch": 4.923684210526316, "grad_norm": 0.003561503952369094, "learning_rate": 1e-06, "loss": 0.0002, "step": 1871 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.24043302237987518, "epoch": 4.926315789473684, "grad_norm": 0.008988670073449612, "learning_rate": 1e-06, "loss": -0.0017, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 841.0, "completions/max_terminated_length": 841.0, "completions/mean_length": 299.7890625, "completions/mean_terminated_length": 299.7890625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "entropy": 0.23490381240844727, "epoch": 4.928947368421053, "frac_reward_zero_std": 0.875, "grad_norm": 0.0076865507289767265, "learning_rate": 1e-06, "loss": -0.0008, "num_tokens": 346205430.0, "reward": 0.8478027582168579, "reward_std": 0.03393760323524475, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.92578125, "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, "rewards/symbolic_reward_partial_score/mean": 0.9744466543197632, "rewards/symbolic_reward_partial_score/std": 0.095624640583992, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.059721827507019, "sampling/importance_sampling_ratio/min": 0.0001972316240426153, "sampling/sampling_logp_difference/max": 8.531131744384766, "sampling/sampling_logp_difference/mean": 0.1227768063545227, "step": 1873 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125, "entropy": 0.23886170983314514, "epoch": 4.931578947368421, "grad_norm": 0.002951346104964614, "learning_rate": 1e-06, "loss": -0.0001, "step": 1874 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.09375, "entropy": 0.23480097204446793, "epoch": 4.934210526315789, "grad_norm": 0.0027505126781761646, "learning_rate": 1e-06, "loss": 0.0007, "step": 1875 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.23670527338981628, "epoch": 4.936842105263158, "grad_norm": 0.004540387075394392, "learning_rate": 1e-06, "loss": -0.0005, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 297.521484375, "completions/mean_terminated_length": 297.521484375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "entropy": 0.23834621906280518, "epoch": 4.939473684210526, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0006869042990729213, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 346775585.0, "reward": 0.876171350479126, "reward_std": 0.010939901694655418, "rewards/progression_diversity/mean": -6.002806549076922e-05, "rewards/progression_diversity/std": 0.0013582800747826695, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.9908853769302368, "rewards/symbolic_reward_partial_score/std": 0.04916610196232796, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0606482028961182, "sampling/importance_sampling_ratio/min": 0.0016635659849271178, "sampling/sampling_logp_difference/max": 6.398791790008545, "sampling/sampling_logp_difference/mean": 0.12316389381885529, "step": 1877 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23614563047885895, "epoch": 4.942105263157895, "grad_norm": 0.0005515136872418225, "learning_rate": 1e-06, "loss": 0.0005, "step": 1878 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0546875, "entropy": 0.23562929034233093, "epoch": 4.9447368421052635, "grad_norm": 0.00030168332159519196, "learning_rate": 1e-06, "loss": -0.0004, "step": 1879 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.2394574210047722, "epoch": 4.947368421052632, "grad_norm": 0.0006382480496540666, "learning_rate": 1e-06, "loss": -0.0001, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 297.638671875, "completions/mean_terminated_length": 297.638671875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "entropy": 0.23923736810684204, "epoch": 4.95, "frac_reward_zero_std": 0.8125, "grad_norm": 0.002834535436704755, "learning_rate": 1e-06, "loss": -0.0002, "num_tokens": 347340808.0, "reward": 0.8722656965255737, "reward_std": 0.04588241130113602, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.958984375, "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, "rewards/symbolic_reward_partial_score/mean": 0.9895833730697632, "rewards/symbolic_reward_partial_score/std": 0.059213150292634964, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0609382390975952, "sampling/importance_sampling_ratio/min": 0.006767674349248409, "sampling/sampling_logp_difference/max": 4.995597839355469, "sampling/sampling_logp_difference/mean": 0.12166203558444977, "step": 1881 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.109375, "entropy": 0.23699773102998734, "epoch": 4.9526315789473685, "grad_norm": 0.0013624252751469612, "learning_rate": 1e-06, "loss": 0.0005, "step": 1882 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23954419791698456, "epoch": 4.955263157894737, "grad_norm": 0.0018693705787882209, "learning_rate": 1e-06, "loss": -0.0001, "step": 1883 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0703125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23460357636213303, "epoch": 4.957894736842105, "grad_norm": 0.007947554811835289, "learning_rate": 1e-06, "loss": 0.0002, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 300.1875, "completions/mean_terminated_length": 300.1875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "entropy": 0.23470059037208557, "epoch": 4.9605263157894735, "frac_reward_zero_std": 0.8125, "grad_norm": 0.006391116417944431, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 347900136.0, "reward": 0.8125, "reward_std": 0.05671097710728645, "rewards/progression_diversity/mean": 0.0, "rewards/progression_diversity/std": 0.0, "rewards/symbolic_reward_accuracy/mean": 0.87890625, "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, "rewards/symbolic_reward_partial_score/mean": 0.9505208730697632, "rewards/symbolic_reward_partial_score/std": 0.13652312755584717, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0614619255065918, "sampling/importance_sampling_ratio/min": 4.8486199375474826e-06, "sampling/sampling_logp_difference/max": 12.23681640625, "sampling/sampling_logp_difference/mean": 0.12415052205324173, "step": 1885 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0859375, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.23707986623048782, "epoch": 4.963157894736842, "grad_norm": 0.0033761621452867985, "learning_rate": 1e-06, "loss": -0.0014, "step": 1886 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.078125, "entropy": 0.23415328562259674, "epoch": 4.965789473684211, "grad_norm": 0.006086938548833132, "learning_rate": 1e-06, "loss": 0.0003, "step": 1887 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0546875, "clip_ratio/low_mean": 0.0625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1171875, "entropy": 0.24153150618076324, "epoch": 4.968421052631579, "grad_norm": 0.0028136041946709156, "learning_rate": 1e-06, "loss": 0.0001, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 356.966796875, "completions/mean_terminated_length": 294.1156921386719, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "entropy": 0.22930483520030975, "epoch": 4.971052631578948, "frac_reward_zero_std": 0.84375, "grad_norm": 0.0019960456993430853, "learning_rate": 1e-06, "loss": 0.0095, "num_tokens": 348517751.0, "reward": 0.8305528163909912, "reward_std": 0.04029066115617752, "rewards/progression_diversity/mean": -0.0013597611105069518, "rewards/progression_diversity/std": 0.030767880380153656, "rewards/symbolic_reward_accuracy/mean": 0.904296875, "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, "rewards/symbolic_reward_partial_score/mean": 0.9612630009651184, "rewards/symbolic_reward_partial_score/std": 0.1397629827260971, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0573625564575195, "sampling/importance_sampling_ratio/min": 1.306584863414173e-06, "sampling/sampling_logp_difference/max": 13.548093795776367, "sampling/sampling_logp_difference/mean": 0.11435255408287048, "step": 1889 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.22930220514535904, "epoch": 4.973684210526316, "grad_norm": 0.002270034048706293, "learning_rate": 1e-06, "loss": 0.0295, "step": 1890 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.22606796026229858, "epoch": 4.976315789473684, "grad_norm": 0.0009876276599243283, "learning_rate": 1e-06, "loss": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0703125, "entropy": 0.23382815718650818, "epoch": 4.978947368421053, "grad_norm": 0.0011854438344016671, "learning_rate": 1e-06, "loss": 0.0001, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 349.64453125, "completions/mean_terminated_length": 286.76470947265625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "entropy": 0.2325260266661644, "epoch": 4.981578947368421, "frac_reward_zero_std": 0.90625, "grad_norm": 0.0010174971539527178, "learning_rate": 1e-06, "loss": -0.0, "num_tokens": 349086241.0, "reward": 0.8748369216918945, "reward_std": 0.014585242606699467, "rewards/progression_diversity/mean": -0.001665753312408924, "rewards/progression_diversity/std": 0.037691693753004074, "rewards/symbolic_reward_accuracy/mean": 0.96484375, "rewards/symbolic_reward_accuracy/std": 0.1843547374010086, "rewards/symbolic_reward_partial_score/mean": 0.98779296875, "rewards/symbolic_reward_partial_score/std": 0.07317520678043365, "rewards/tag_count_reward/mean": -0.00390625, "rewards/tag_count_reward/std": 0.06243881583213806, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0565659999847412, "sampling/importance_sampling_ratio/min": 8.482191333314404e-06, "sampling/sampling_logp_difference/max": 11.677541732788086, "sampling/sampling_logp_difference/mean": 0.11560506373643875, "step": 1893 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 0.22664856910705566, "epoch": 4.984210526315789, "grad_norm": 0.007991527207195759, "learning_rate": 1e-06, "loss": 0.0255, "step": 1894 }, { "clip_ratio/high_max": 1.0, "clip_ratio/high_mean": 0.0625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0625, "entropy": 0.23320794105529785, "epoch": 4.9868421052631575, "grad_norm": 0.00029770872788503766, "learning_rate": 1e-06, "loss": -0.0005, "step": 1895 }, { "clip_ratio/high_max": 0.25, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.22329071164131165, "epoch": 4.989473684210527, "grad_norm": 0.0010386735666543245, "learning_rate": 1e-06, "loss": 0.0115, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.001953125, "completions/max_length": 16384.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 323.32421875, "completions/mean_terminated_length": 291.8943176269531, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "entropy": 0.23654846101999283, "epoch": 4.992105263157895, "frac_reward_zero_std": 0.875, "grad_norm": 0.003904322162270546, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 349636743.0, "reward": 0.8332948684692383, "reward_std": 0.03343512490391731, "rewards/progression_diversity/mean": -0.0005941174458712339, "rewards/progression_diversity/std": 0.013443343341350555, "rewards/symbolic_reward_accuracy/mean": 0.90234375, "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, "rewards/symbolic_reward_partial_score/mean": 0.9736328125, "rewards/symbolic_reward_partial_score/std": 0.09050531685352325, "rewards/tag_count_reward/mean": -0.001953125, "rewards/tag_count_reward/std": 0.04419417306780815, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.058250904083252, "sampling/importance_sampling_ratio/min": 8.018063090275973e-05, "sampling/sampling_logp_difference/max": 9.431228637695312, "sampling/sampling_logp_difference/mean": 0.11942292749881744, "step": 1897 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 0.23303639888763428, "epoch": 4.994736842105263, "grad_norm": 0.0021931680385023355, "learning_rate": 1e-06, "loss": -0.0, "step": 1898 }, { "clip_ratio/high_max": 0.75, "clip_ratio/high_mean": 0.046875, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.1015625, "entropy": 0.23574919998645782, "epoch": 4.997368421052632, "grad_norm": 0.0010034493170678616, "learning_rate": 1e-06, "loss": -0.0, "step": 1899 }, { "clip_ratio/high_max": 0.5, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.046875, "entropy": 0.23388534039258957, "epoch": 5.0, "grad_norm": 0.00732083385810256, "learning_rate": 1e-06, "loss": 0.0102, "step": 1900 }, { "epoch": 5.0, "eval_clip_ratio/high_max": 0.0, "eval_clip_ratio/high_mean": 0.0, "eval_clip_ratio/low_mean": 0.0, "eval_clip_ratio/low_min": 0.0, "eval_clip_ratio/region_mean": 0.0, "eval_completions/clipped_ratio": 0.0, "eval_completions/max_length": 491.96875, "eval_completions/max_terminated_length": 491.96875, "eval_completions/mean_length": 286.003662109375, "eval_completions/mean_terminated_length": 286.003662109375, "eval_completions/min_length": 170.9375, "eval_completions/min_terminated_length": 170.9375, "eval_entropy": 0.23818373354151845, "eval_frac_reward_zero_std": 0.875, "eval_loss": 2.37619078689022e-05, "eval_num_tokens": 349636743.0, "eval_reward": 0.8882439639419317, "eval_reward_std": 0.02008125601182087, "eval_rewards/progression_diversity/mean": -7.02718230058963e-05, "eval_rewards/progression_diversity/std": 0.000760064329369925, "eval_rewards/symbolic_reward_accuracy/mean": 0.9833984375, "eval_rewards/symbolic_reward_accuracy/std": 0.07969532138668001, "eval_rewards/symbolic_reward_partial_score/mean": 0.9947509746998549, "eval_rewards/symbolic_reward_partial_score/std": 0.03326635103439912, "eval_rewards/tag_count_reward/mean": -0.002197265625, "eval_rewards/tag_count_reward/std": 0.016002451302483678, "eval_runtime": 94.9953, "eval_samples_per_second": 2.632, "eval_sampling/importance_sampling_ratio/max": 2.0, "eval_sampling/importance_sampling_ratio/mean": 1.0600753016769886, "eval_sampling/importance_sampling_ratio/min": 0.004165468698646399, "eval_sampling/sampling_logp_difference/max": 13.449735283851624, "eval_sampling/sampling_logp_difference/mean": 0.12632932304404676, "eval_steps_per_second": 0.021, "step": 1900 }, { "epoch": 5.0, "step": 1900, "total_flos": 0.0, "train_loss": 0.013345611529686698, "train_runtime": 16054.0136, "train_samples_per_second": 0.951, "train_steps_per_second": 0.118 } ], "logging_steps": 1, "max_steps": 1900, "num_input_tokens_seen": 349636743, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }