diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,36333 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.0, + "eval_steps": 500, + "global_step": 1900, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11232.0, + "completions/mean_length": 2382.95703125, + "completions/mean_terminated_length": 1960.3902587890625, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.2613159120082855, + "epoch": 0.002631578947368421, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0321967713534832, + "learning_rate": 1e-06, + "loss": 0.0443, + "num_tokens": 1623882.0, + "reward": 0.319697767496109, + "reward_std": 0.3174176812171936, + "rewards/progression_diversity/mean": -0.0028816750273108482, + "rewards/progression_diversity/std": 0.03537697717547417, + "rewards/symbolic_reward_accuracy/mean": 0.265625, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.5423176884651184, + "rewards/symbolic_reward_partial_score/std": 0.333414226770401, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0534241199493408, + "sampling/importance_sampling_ratio/min": 2.7432516327974277e-11, + "sampling/sampling_logp_difference/max": 24.319292068481445, + "sampling/sampling_logp_difference/mean": 0.10465320944786072, + "step": 1 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2668819725513458, + "epoch": 0.005263157894736842, + "grad_norm": 0.02215813286602497, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 2 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.25824375450611115, + "epoch": 0.007894736842105263, + "grad_norm": 0.02271086722612381, + "learning_rate": 1e-06, + "loss": 0.0381, + "step": 3 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.25850287079811096, + "epoch": 0.010526315789473684, + "grad_norm": 0.030293075367808342, + "learning_rate": 1e-06, + "loss": 0.0528, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13193.0, + "completions/mean_length": 2588.431640625, + "completions/mean_terminated_length": 2027.6361083984375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.2614176720380783, + "epoch": 0.013157894736842105, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.030042776837944984, + "learning_rate": 1e-06, + "loss": 0.0258, + "num_tokens": 3343623.0, + "reward": 0.38939642906188965, + "reward_std": 0.3555682599544525, + "rewards/progression_diversity/mean": -0.0007864796789363027, + "rewards/progression_diversity/std": 0.008491357788443565, + "rewards/symbolic_reward_accuracy/mean": 0.35546875, + "rewards/symbolic_reward_accuracy/std": 0.47912323474884033, + "rewards/symbolic_reward_partial_score/mean": 0.5968424081802368, + "rewards/symbolic_reward_partial_score/std": 0.35041725635528564, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.051404595375061, + "sampling/importance_sampling_ratio/min": 0.00030985596822574735, + "sampling/sampling_logp_difference/max": 8.079402923583984, + "sampling/sampling_logp_difference/mean": 0.09999503940343857, + "step": 5 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2622606158256531, + "epoch": 0.015789473684210527, + "grad_norm": 0.02478160709142685, + "learning_rate": 1e-06, + "loss": 0.0686, + "step": 6 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2552778720855713, + "epoch": 0.018421052631578946, + "grad_norm": 0.02954026311635971, + "learning_rate": 1e-06, + "loss": 0.0668, + "step": 7 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.25751645863056183, + "epoch": 0.021052631578947368, + "grad_norm": 0.026725415140390396, + "learning_rate": 1e-06, + "loss": 0.0219, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12436.0, + "completions/mean_length": 2186.072265625, + "completions/mean_terminated_length": 1816.1864013671875, + "completions/min_length": 362.0, + "completions/min_terminated_length": 362.0, + "entropy": 0.2712424546480179, + "epoch": 0.02368421052631579, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.032029978930950165, + "learning_rate": 1e-06, + "loss": 0.0221, + "num_tokens": 4856012.0, + "reward": 0.4238227605819702, + "reward_std": 0.36550843715667725, + "rewards/progression_diversity/mean": -0.0005370433209463954, + "rewards/progression_diversity/std": 0.007207411807030439, + "rewards/symbolic_reward_accuracy/mean": 0.3984375, + "rewards/symbolic_reward_accuracy/std": 0.4900552034378052, + "rewards/symbolic_reward_partial_score/mean": 0.6223958134651184, + "rewards/symbolic_reward_partial_score/std": 0.3536975085735321, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.055936574935913, + "sampling/importance_sampling_ratio/min": 0.00310147344134748, + "sampling/sampling_logp_difference/max": 5.775877952575684, + "sampling/sampling_logp_difference/mean": 0.10886339843273163, + "step": 9 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2743559181690216, + "epoch": 0.02631578947368421, + "grad_norm": 0.021439263597130775, + "learning_rate": 1e-06, + "loss": 0.032, + "step": 10 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2593652904033661, + "epoch": 0.02894736842105263, + "grad_norm": 0.030393775552511215, + "learning_rate": 1e-06, + "loss": 0.0474, + "step": 11 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2632688581943512, + "epoch": 0.031578947368421054, + "grad_norm": 0.023602856323122978, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13597.0, + "completions/mean_length": 2401.51953125, + "completions/mean_terminated_length": 2094.51904296875, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.2705395370721817, + "epoch": 0.034210526315789476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.027274658903479576, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 6479734.0, + "reward": 0.4843239188194275, + "reward_std": 0.356700599193573, + "rewards/progression_diversity/mean": -0.00022850481036584824, + "rewards/progression_diversity/std": 0.0038324107881635427, + "rewards/symbolic_reward_accuracy/mean": 0.482421875, + "rewards/symbolic_reward_accuracy/std": 0.5001795887947083, + "rewards/symbolic_reward_partial_score/mean": 0.6554361581802368, + "rewards/symbolic_reward_partial_score/std": 0.37485411763191223, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0567376613616943, + "sampling/importance_sampling_ratio/min": 0.004240815062075853, + "sampling/sampling_logp_difference/max": 5.4629998207092285, + "sampling/sampling_logp_difference/mean": 0.11016078293323517, + "step": 13 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2603074461221695, + "epoch": 0.03684210526315789, + "grad_norm": 0.03649430721998215, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 14 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.2602958679199219, + "epoch": 0.039473684210526314, + "grad_norm": 0.019655603915452957, + "learning_rate": 1e-06, + "loss": 0.0592, + "step": 15 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.28125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2742217630147934, + "epoch": 0.042105263157894736, + "grad_norm": 0.019776981323957443, + "learning_rate": 1e-06, + "loss": 0.0364, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13346.0, + "completions/mean_length": 2445.453125, + "completions/mean_terminated_length": 1966.755615234375, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.2584807947278023, + "epoch": 0.04473684210526316, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.04284660518169403, + "learning_rate": 1e-06, + "loss": 0.0521, + "num_tokens": 8138942.0, + "reward": 0.5522767305374146, + "reward_std": 0.32186436653137207, + "rewards/progression_diversity/mean": -0.00181739148683846, + "rewards/progression_diversity/std": 0.018342694267630577, + "rewards/symbolic_reward_accuracy/mean": 0.568359375, + "rewards/symbolic_reward_accuracy/std": 0.4957893490791321, + "rewards/symbolic_reward_partial_score/mean": 0.71337890625, + "rewards/symbolic_reward_partial_score/std": 0.37059590220451355, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.053914189338684, + "sampling/importance_sampling_ratio/min": 0.004635987337678671, + "sampling/sampling_logp_difference/max": 5.373906135559082, + "sampling/sampling_logp_difference/mean": 0.10624644160270691, + "step": 17 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.26499390602111816, + "epoch": 0.04736842105263158, + "grad_norm": 0.02529755048453808, + "learning_rate": 1e-06, + "loss": 0.038, + "step": 18 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.26124098896980286, + "epoch": 0.05, + "grad_norm": 0.027479395270347595, + "learning_rate": 1e-06, + "loss": 0.0462, + "step": 19 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2685445100069046, + "epoch": 0.05263157894736842, + "grad_norm": 0.02501635067164898, + "learning_rate": 1e-06, + "loss": 0.0469, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14541.0, + "completions/mean_length": 2491.662109375, + "completions/mean_terminated_length": 2101.1142578125, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.25292401015758514, + "epoch": 0.05526315789473684, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.03069983422756195, + "learning_rate": 1e-06, + "loss": 0.0885, + "num_tokens": 9818961.0, + "reward": 0.5678541660308838, + "reward_std": 0.30515754222869873, + "rewards/progression_diversity/mean": -0.0016935247695073485, + "rewards/progression_diversity/std": 0.024840721860527992, + "rewards/symbolic_reward_accuracy/mean": 0.591796875, + "rewards/symbolic_reward_accuracy/std": 0.49198177456855774, + "rewards/symbolic_reward_partial_score/mean": 0.7164713144302368, + "rewards/symbolic_reward_partial_score/std": 0.38369399309158325, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0556615591049194, + "sampling/importance_sampling_ratio/min": 3.9937659049577425e-16, + "sampling/sampling_logp_difference/max": 35.456626892089844, + "sampling/sampling_logp_difference/mean": 0.10836475342512131, + "step": 21 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2719355523586273, + "epoch": 0.05789473684210526, + "grad_norm": 0.02567470446228981, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 22 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2798517644405365, + "epoch": 0.060526315789473685, + "grad_norm": 0.01759127527475357, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 23 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.26892197132110596, + "epoch": 0.06315789473684211, + "grad_norm": 0.029061393812298775, + "learning_rate": 1e-06, + "loss": 0.0581, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15759.0, + "completions/mean_length": 2611.63671875, + "completions/mean_terminated_length": 2167.366943359375, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "entropy": 0.2623217850923538, + "epoch": 0.06578947368421052, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.04489253833889961, + "learning_rate": 1e-06, + "loss": 0.0677, + "num_tokens": 11567607.0, + "reward": 0.6099430322647095, + "reward_std": 0.3489864468574524, + "rewards/progression_diversity/mean": -0.0017921316903084517, + "rewards/progression_diversity/std": 0.023787712678313255, + "rewards/symbolic_reward_accuracy/mean": 0.642578125, + "rewards/symbolic_reward_accuracy/std": 0.4797092080116272, + "rewards/symbolic_reward_partial_score/mean": 0.755859375, + "rewards/symbolic_reward_partial_score/std": 0.3704371750354767, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0542800426483154, + "sampling/importance_sampling_ratio/min": 0.00033611588878557086, + "sampling/sampling_logp_difference/max": 7.998054504394531, + "sampling/sampling_logp_difference/mean": 0.10523411631584167, + "step": 25 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2667793333530426, + "epoch": 0.06842105263157895, + "grad_norm": 0.042507365345954895, + "learning_rate": 1e-06, + "loss": 0.042, + "step": 26 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2573726028203964, + "epoch": 0.07105263157894737, + "grad_norm": 0.03612668439745903, + "learning_rate": 1e-06, + "loss": 0.0948, + "step": 27 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2717868983745575, + "epoch": 0.07368421052631578, + "grad_norm": 0.03351793438196182, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15168.0, + "completions/mean_length": 2566.255859375, + "completions/mean_terminated_length": 2062.775390625, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.2697293758392334, + "epoch": 0.07631578947368421, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.036783039569854736, + "learning_rate": 1e-06, + "loss": 0.0337, + "num_tokens": 13282330.0, + "reward": 0.6279209852218628, + "reward_std": 0.30352213978767395, + "rewards/progression_diversity/mean": -0.0008747372776269913, + "rewards/progression_diversity/std": 0.01144934818148613, + "rewards/symbolic_reward_accuracy/mean": 0.666015625, + "rewards/symbolic_reward_accuracy/std": 0.47209542989730835, + "rewards/symbolic_reward_partial_score/mean": 0.771484375, + "rewards/symbolic_reward_partial_score/std": 0.36460402607917786, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0549585819244385, + "sampling/importance_sampling_ratio/min": 7.974162144819275e-05, + "sampling/sampling_logp_difference/max": 9.436718940734863, + "sampling/sampling_logp_difference/mean": 0.10604645311832428, + "step": 29 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2652689218521118, + "epoch": 0.07894736842105263, + "grad_norm": 0.031599096953868866, + "learning_rate": 1e-06, + "loss": 0.0545, + "step": 30 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2544643208384514, + "epoch": 0.08157894736842106, + "grad_norm": 0.03158700093626976, + "learning_rate": 1e-06, + "loss": 0.1139, + "step": 31 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.27170825004577637, + "epoch": 0.08421052631578947, + "grad_norm": 0.026205774396657944, + "learning_rate": 1e-06, + "loss": 0.0488, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12587.0, + "completions/mean_length": 2206.931640625, + "completions/mean_terminated_length": 1895.65869140625, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.26374557614326477, + "epoch": 0.0868421052631579, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.02841874025762081, + "learning_rate": 1e-06, + "loss": 0.0425, + "num_tokens": 14809751.0, + "reward": 0.6377917528152466, + "reward_std": 0.28174924850463867, + "rewards/progression_diversity/mean": -0.00011936978262383491, + "rewards/progression_diversity/std": 0.0016766022890806198, + "rewards/symbolic_reward_accuracy/mean": 0.677734375, + "rewards/symbolic_reward_accuracy/std": 0.46780112385749817, + "rewards/symbolic_reward_partial_score/mean": 0.7744140625, + "rewards/symbolic_reward_partial_score/std": 0.37388041615486145, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0545893907546997, + "sampling/importance_sampling_ratio/min": 0.0025524995289742947, + "sampling/sampling_logp_difference/max": 5.970682144165039, + "sampling/sampling_logp_difference/mean": 0.1066381186246872, + "step": 33 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2746908664703369, + "epoch": 0.08947368421052632, + "grad_norm": 0.033538322895765305, + "learning_rate": 1e-06, + "loss": 0.0432, + "step": 34 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2674672603607178, + "epoch": 0.09210526315789473, + "grad_norm": 0.02931833639740944, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 35 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2641746252775192, + "epoch": 0.09473684210526316, + "grad_norm": 0.038021378219127655, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11568.0, + "completions/mean_length": 2400.705078125, + "completions/mean_terminated_length": 1949.6309814453125, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.27227024734020233, + "epoch": 0.09736842105263158, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.024732865393161774, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 16416512.0, + "reward": 0.6867997050285339, + "reward_std": 0.2583235502243042, + "rewards/progression_diversity/mean": -0.0016752530355006456, + "rewards/progression_diversity/std": 0.021188421174883842, + "rewards/symbolic_reward_accuracy/mean": 0.744140625, + "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, + "rewards/symbolic_reward_partial_score/mean": 0.8095703125, + "rewards/symbolic_reward_partial_score/std": 0.3585224747657776, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0554531812667847, + "sampling/importance_sampling_ratio/min": 1.577901821292471e-05, + "sampling/sampling_logp_difference/max": 11.056829452514648, + "sampling/sampling_logp_difference/mean": 0.10788406431674957, + "step": 37 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.27088363468647003, + "epoch": 0.1, + "grad_norm": 0.0261248666793108, + "learning_rate": 1e-06, + "loss": 0.0765, + "step": 38 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.26246321201324463, + "epoch": 0.10263157894736842, + "grad_norm": 0.019391866400837898, + "learning_rate": 1e-06, + "loss": 0.0742, + "step": 39 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2687445729970932, + "epoch": 0.10526315789473684, + "grad_norm": 0.02644912153482437, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14635.0, + "completions/mean_length": 2510.658203125, + "completions/mean_terminated_length": 2091.945556640625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.26613570749759674, + "epoch": 0.10789473684210527, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.0331517830491066, + "learning_rate": 1e-06, + "loss": 0.0651, + "num_tokens": 18096273.0, + "reward": 0.6728851199150085, + "reward_std": 0.25044363737106323, + "rewards/progression_diversity/mean": -0.0015264635439962149, + "rewards/progression_diversity/std": 0.01903173141181469, + "rewards/symbolic_reward_accuracy/mean": 0.72265625, + "rewards/symbolic_reward_accuracy/std": 0.4481254518032074, + "rewards/symbolic_reward_partial_score/mean": 0.8055012822151184, + "rewards/symbolic_reward_partial_score/std": 0.3548940122127533, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.055107831954956, + "sampling/importance_sampling_ratio/min": 9.60548914008541e-06, + "sampling/sampling_logp_difference/max": 11.553175926208496, + "sampling/sampling_logp_difference/mean": 0.10785190761089325, + "step": 41 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.26286329329013824, + "epoch": 0.11052631578947368, + "grad_norm": 0.029875140637159348, + "learning_rate": 1e-06, + "loss": 0.0671, + "step": 42 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.27232979238033295, + "epoch": 0.11315789473684211, + "grad_norm": 0.026524005457758904, + "learning_rate": 1e-06, + "loss": 0.0335, + "step": 43 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.26245296001434326, + "epoch": 0.11578947368421053, + "grad_norm": 0.034007180482149124, + "learning_rate": 1e-06, + "loss": 0.0868, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10722.0, + "completions/mean_length": 2233.7578125, + "completions/mean_terminated_length": 1894.152099609375, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.2722453474998474, + "epoch": 0.11842105263157894, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.057794682681560516, + "learning_rate": 1e-06, + "loss": 0.0629, + "num_tokens": 19646613.0, + "reward": 0.6934746503829956, + "reward_std": 0.30955323576927185, + "rewards/progression_diversity/mean": -0.0031212307512760162, + "rewards/progression_diversity/std": 0.02668512612581253, + "rewards/symbolic_reward_accuracy/mean": 0.75, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.8194986581802368, + "rewards/symbolic_reward_partial_score/std": 0.3468964397907257, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0547754764556885, + "sampling/importance_sampling_ratio/min": 0.002088946523144841, + "sampling/sampling_logp_difference/max": 6.171095371246338, + "sampling/sampling_logp_difference/mean": 0.10662943124771118, + "step": 45 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2671891450881958, + "epoch": 0.12105263157894737, + "grad_norm": 0.022444499656558037, + "learning_rate": 1e-06, + "loss": 0.0136, + "step": 46 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.25909220427274704, + "epoch": 0.12368421052631579, + "grad_norm": 0.023088127374649048, + "learning_rate": 1e-06, + "loss": 0.0754, + "step": 47 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.26286621391773224, + "epoch": 0.12631578947368421, + "grad_norm": 0.026823926717042923, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11726.0, + "completions/mean_length": 2254.619140625, + "completions/mean_terminated_length": 1769.3677978515625, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "entropy": 0.2699515223503113, + "epoch": 0.12894736842105264, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.02539858967065811, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 21198770.0, + "reward": 0.674487829208374, + "reward_std": 0.2851996421813965, + "rewards/progression_diversity/mean": -0.0023922312539070845, + "rewards/progression_diversity/std": 0.024297581985592842, + "rewards/symbolic_reward_accuracy/mean": 0.728515625, + "rewards/symbolic_reward_accuracy/std": 0.44516023993492126, + "rewards/symbolic_reward_partial_score/mean": 0.8011067509651184, + "rewards/symbolic_reward_partial_score/std": 0.36674949526786804, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0519602298736572, + "sampling/importance_sampling_ratio/min": 5.275764829005907e-16, + "sampling/sampling_logp_difference/max": 35.17823791503906, + "sampling/sampling_logp_difference/mean": 0.1013394445180893, + "step": 49 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.25432244688272476, + "epoch": 0.13157894736842105, + "grad_norm": 0.021180639043450356, + "learning_rate": 1e-06, + "loss": 0.0514, + "step": 50 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2609899342060089, + "epoch": 0.13421052631578947, + "grad_norm": 0.021488042548298836, + "learning_rate": 1e-06, + "loss": 0.097, + "step": 51 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.24922076612710953, + "epoch": 0.1368421052631579, + "grad_norm": 0.027309391647577286, + "learning_rate": 1e-06, + "loss": 0.1207, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16107.0, + "completions/mean_length": 2299.81640625, + "completions/mean_terminated_length": 1845.48779296875, + "completions/min_length": 310.0, + "completions/min_terminated_length": 310.0, + "entropy": 0.26104749739170074, + "epoch": 0.1394736842105263, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.028313223272562027, + "learning_rate": 1e-06, + "loss": 0.0445, + "num_tokens": 22783412.0, + "reward": 0.6890961527824402, + "reward_std": 0.2640804052352905, + "rewards/progression_diversity/mean": -0.0015217037871479988, + "rewards/progression_diversity/std": 0.01678471639752388, + "rewards/symbolic_reward_accuracy/mean": 0.740234375, + "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, + "rewards/symbolic_reward_partial_score/mean": 0.82568359375, + "rewards/symbolic_reward_partial_score/std": 0.3356630504131317, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052910566329956, + "sampling/importance_sampling_ratio/min": 7.863413884479087e-07, + "sampling/sampling_logp_difference/max": 14.055874824523926, + "sampling/sampling_logp_difference/mean": 0.10275271534919739, + "step": 53 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.26632043719291687, + "epoch": 0.14210526315789473, + "grad_norm": 0.029039481654763222, + "learning_rate": 1e-06, + "loss": 0.0791, + "step": 54 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2686396688222885, + "epoch": 0.14473684210526316, + "grad_norm": 0.028012612834572792, + "learning_rate": 1e-06, + "loss": 0.0624, + "step": 55 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.25890131294727325, + "epoch": 0.14736842105263157, + "grad_norm": 0.025674991309642792, + "learning_rate": 1e-06, + "loss": 0.086, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12780.0, + "completions/mean_length": 2413.14453125, + "completions/mean_terminated_length": 1874.7139892578125, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "entropy": 0.25308099389076233, + "epoch": 0.15, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.04013952985405922, + "learning_rate": 1e-06, + "loss": 0.0407, + "num_tokens": 24430270.0, + "reward": 0.6752703189849854, + "reward_std": 0.308138370513916, + "rewards/progression_diversity/mean": -0.002267459873110056, + "rewards/progression_diversity/std": 0.02433175779879093, + "rewards/symbolic_reward_accuracy/mean": 0.7265625, + "rewards/symbolic_reward_accuracy/std": 0.4461594223976135, + "rewards/symbolic_reward_partial_score/mean": 0.8076171875, + "rewards/symbolic_reward_partial_score/std": 0.35207507014274597, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0503523349761963, + "sampling/importance_sampling_ratio/min": 7.612612193952373e-07, + "sampling/sampling_logp_difference/max": 14.088289260864258, + "sampling/sampling_logp_difference/mean": 0.09810857474803925, + "step": 57 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2614179700613022, + "epoch": 0.15263157894736842, + "grad_norm": 0.03701889514923096, + "learning_rate": 1e-06, + "loss": 0.0238, + "step": 58 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.24882755428552628, + "epoch": 0.15526315789473685, + "grad_norm": 0.0272480770945549, + "learning_rate": 1e-06, + "loss": 0.1082, + "step": 59 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.25699031352996826, + "epoch": 0.15789473684210525, + "grad_norm": 0.03054182417690754, + "learning_rate": 1e-06, + "loss": 0.0537, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16104.0, + "completions/mean_length": 2519.41796875, + "completions/mean_terminated_length": 1955.8170166015625, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.2684636861085892, + "epoch": 0.16052631578947368, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.028775321319699287, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 26120852.0, + "reward": 0.7042874693870544, + "reward_std": 0.26989448070526123, + "rewards/progression_diversity/mean": -0.0009394375374540687, + "rewards/progression_diversity/std": 0.01054183766245842, + "rewards/symbolic_reward_accuracy/mean": 0.767578125, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.8235676884651184, + "rewards/symbolic_reward_partial_score/std": 0.3544165790081024, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0519888401031494, + "sampling/importance_sampling_ratio/min": 0.0002099307457683608, + "sampling/sampling_logp_difference/max": 8.468732833862305, + "sampling/sampling_logp_difference/mean": 0.10232645273208618, + "step": 61 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.25643981993198395, + "epoch": 0.1631578947368421, + "grad_norm": 0.03552790358662605, + "learning_rate": 1e-06, + "loss": 0.1004, + "step": 62 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2648386210203171, + "epoch": 0.16578947368421051, + "grad_norm": 0.032785214483737946, + "learning_rate": 1e-06, + "loss": 0.065, + "step": 63 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.25679877400398254, + "epoch": 0.16842105263157894, + "grad_norm": 0.027879441156983376, + "learning_rate": 1e-06, + "loss": 0.0989, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14055.0, + "completions/mean_length": 3021.4609375, + "completions/mean_terminated_length": 2160.2578125, + "completions/min_length": 387.0, + "completions/min_terminated_length": 387.0, + "entropy": 0.2547323405742645, + "epoch": 0.17105263157894737, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.036476247012615204, + "learning_rate": 1e-06, + "loss": 0.0497, + "num_tokens": 28101184.0, + "reward": 0.6189697980880737, + "reward_std": 0.3368384838104248, + "rewards/progression_diversity/mean": -0.002434882801026106, + "rewards/progression_diversity/std": 0.020372329279780388, + "rewards/symbolic_reward_accuracy/mean": 0.66796875, + "rewards/symbolic_reward_accuracy/std": 0.47140273451805115, + "rewards/symbolic_reward_partial_score/mean": 0.7449544072151184, + "rewards/symbolic_reward_partial_score/std": 0.4009448289871216, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049661636352539, + "sampling/importance_sampling_ratio/min": 8.808132175019967e-13, + "sampling/sampling_logp_difference/max": 27.757930755615234, + "sampling/sampling_logp_difference/mean": 0.0967579260468483, + "step": 65 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.24418476969003677, + "epoch": 0.1736842105263158, + "grad_norm": 0.02933136560022831, + "learning_rate": 1e-06, + "loss": 0.0656, + "step": 66 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2377062812447548, + "epoch": 0.1763157894736842, + "grad_norm": 0.028272006660699844, + "learning_rate": 1e-06, + "loss": 0.1092, + "step": 67 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.251356840133667, + "epoch": 0.17894736842105263, + "grad_norm": 0.03293720260262489, + "learning_rate": 1e-06, + "loss": 0.0516, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11239.0, + "completions/mean_length": 2362.73046875, + "completions/mean_terminated_length": 1822.35693359375, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.2604978382587433, + "epoch": 0.18157894736842106, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.031264252960681915, + "learning_rate": 1e-06, + "loss": 0.0868, + "num_tokens": 29700854.0, + "reward": 0.7232824563980103, + "reward_std": 0.27414870262145996, + "rewards/progression_diversity/mean": -0.0008577151456847787, + "rewards/progression_diversity/std": 0.009135694243013859, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.85498046875, + "rewards/symbolic_reward_partial_score/std": 0.3152819871902466, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0526225566864014, + "sampling/importance_sampling_ratio/min": 1.6764071233410505e-06, + "sampling/sampling_logp_difference/max": 13.298857688903809, + "sampling/sampling_logp_difference/mean": 0.10294780135154724, + "step": 69 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2632171958684921, + "epoch": 0.18421052631578946, + "grad_norm": 0.0390724278986454, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 70 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2604610174894333, + "epoch": 0.1868421052631579, + "grad_norm": 0.03644363954663277, + "learning_rate": 1e-06, + "loss": 0.1041, + "step": 71 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2630729079246521, + "epoch": 0.18947368421052632, + "grad_norm": 0.02937830239534378, + "learning_rate": 1e-06, + "loss": 0.0363, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12593.0, + "completions/mean_length": 2441.689453125, + "completions/mean_terminated_length": 1845.37890625, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.2574312090873718, + "epoch": 0.19210526315789472, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.025905072689056396, + "learning_rate": 1e-06, + "loss": 0.0491, + "num_tokens": 31351639.0, + "reward": 0.724668025970459, + "reward_std": 0.24668556451797485, + "rewards/progression_diversity/mean": -0.003901800373569131, + "rewards/progression_diversity/std": 0.030434370040893555, + "rewards/symbolic_reward_accuracy/mean": 0.7890625, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.8499348759651184, + "rewards/symbolic_reward_partial_score/std": 0.3279423713684082, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0521684885025024, + "sampling/importance_sampling_ratio/min": 1.6198048111149e-11, + "sampling/sampling_logp_difference/max": 24.84613037109375, + "sampling/sampling_logp_difference/mean": 0.10166233777999878, + "step": 73 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2581082284450531, + "epoch": 0.19473684210526315, + "grad_norm": 0.022781765088438988, + "learning_rate": 1e-06, + "loss": 0.0576, + "step": 74 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2618667483329773, + "epoch": 0.19736842105263158, + "grad_norm": 0.02607133612036705, + "learning_rate": 1e-06, + "loss": 0.0836, + "step": 75 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.25442183017730713, + "epoch": 0.2, + "grad_norm": 0.019913366064429283, + "learning_rate": 1e-06, + "loss": 0.0734, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14265.0, + "completions/mean_length": 2636.556640625, + "completions/mean_terminated_length": 1930.8358154296875, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.2596082091331482, + "epoch": 0.2026315789473684, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.02666478045284748, + "learning_rate": 1e-06, + "loss": 0.0508, + "num_tokens": 33113044.0, + "reward": 0.6970432996749878, + "reward_std": 0.27402693033218384, + "rewards/progression_diversity/mean": -0.0027059155981987715, + "rewards/progression_diversity/std": 0.023960862308740616, + "rewards/symbolic_reward_accuracy/mean": 0.75, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.837890625, + "rewards/symbolic_reward_partial_score/std": 0.3269626796245575, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0499635934829712, + "sampling/importance_sampling_ratio/min": 5.463769866764778e-06, + "sampling/sampling_logp_difference/max": 12.117371559143066, + "sampling/sampling_logp_difference/mean": 0.09771312028169632, + "step": 77 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2613191306591034, + "epoch": 0.20526315789473684, + "grad_norm": 0.025247380137443542, + "learning_rate": 1e-06, + "loss": 0.0371, + "step": 78 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2529866099357605, + "epoch": 0.20789473684210527, + "grad_norm": 0.030076855793595314, + "learning_rate": 1e-06, + "loss": 0.0572, + "step": 79 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.24038981646299362, + "epoch": 0.21052631578947367, + "grad_norm": 0.0339297391474247, + "learning_rate": 1e-06, + "loss": 0.1197, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8195.0, + "completions/mean_length": 2479.08984375, + "completions/mean_terminated_length": 1765.2855224609375, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "entropy": 0.2412530928850174, + "epoch": 0.2131578947368421, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.02495790272951126, + "learning_rate": 1e-06, + "loss": 0.0973, + "num_tokens": 34795330.0, + "reward": 0.7531094551086426, + "reward_std": 0.24608904123306274, + "rewards/progression_diversity/mean": -0.0015581449260935187, + "rewards/progression_diversity/std": 0.013487322255969048, + "rewards/symbolic_reward_accuracy/mean": 0.82421875, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.8756510019302368, + "rewards/symbolic_reward_partial_score/std": 0.30089128017425537, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0499427318572998, + "sampling/importance_sampling_ratio/min": 4.466129244207195e-09, + "sampling/sampling_logp_difference/max": 19.226743698120117, + "sampling/sampling_logp_difference/mean": 0.09754176437854767, + "step": 81 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.2646462917327881, + "epoch": 0.21578947368421053, + "grad_norm": 0.02527514286339283, + "learning_rate": 1e-06, + "loss": 0.051, + "step": 82 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2567380368709564, + "epoch": 0.21842105263157896, + "grad_norm": 0.032149430364370346, + "learning_rate": 1e-06, + "loss": 0.0786, + "step": 83 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25139138102531433, + "epoch": 0.22105263157894736, + "grad_norm": 0.03584575280547142, + "learning_rate": 1e-06, + "loss": 0.078, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14563.0, + "completions/mean_length": 2363.416015625, + "completions/mean_terminated_length": 1793.4735107421875, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.2689914107322693, + "epoch": 0.2236842105263158, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.028427930548787117, + "learning_rate": 1e-06, + "loss": 0.0551, + "num_tokens": 36393687.0, + "reward": 0.7451353073120117, + "reward_std": 0.23895391821861267, + "rewards/progression_diversity/mean": -0.003074061591178179, + "rewards/progression_diversity/std": 0.030696000903844833, + "rewards/symbolic_reward_accuracy/mean": 0.8125, + "rewards/symbolic_reward_accuracy/std": 0.39069411158561707, + "rewards/symbolic_reward_partial_score/mean": 0.87060546875, + "rewards/symbolic_reward_partial_score/std": 0.3043627142906189, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0520421266555786, + "sampling/importance_sampling_ratio/min": 0.0007652377826161683, + "sampling/sampling_logp_difference/max": 7.175323963165283, + "sampling/sampling_logp_difference/mean": 0.10205866396427155, + "step": 85 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.26435205340385437, + "epoch": 0.22631578947368422, + "grad_norm": 0.02536817453801632, + "learning_rate": 1e-06, + "loss": 0.0536, + "step": 86 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.26483944058418274, + "epoch": 0.22894736842105262, + "grad_norm": 0.029015418142080307, + "learning_rate": 1e-06, + "loss": 0.0774, + "step": 87 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.24875866621732712, + "epoch": 0.23157894736842105, + "grad_norm": 0.04244063422083855, + "learning_rate": 1e-06, + "loss": 0.1163, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13828.0, + "completions/mean_length": 2104.162109375, + "completions/mean_terminated_length": 1673.1810302734375, + "completions/min_length": 330.0, + "completions/min_terminated_length": 330.0, + "entropy": 0.26499253511428833, + "epoch": 0.23421052631578948, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.018304875120520592, + "learning_rate": 1e-06, + "loss": 0.0338, + "num_tokens": 37858282.0, + "reward": 0.785420298576355, + "reward_std": 0.17559418082237244, + "rewards/progression_diversity/mean": -0.002899130806326866, + "rewards/progression_diversity/std": 0.025376563891768456, + "rewards/symbolic_reward_accuracy/mean": 0.857421875, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.9124348759651184, + "rewards/symbolic_reward_partial_score/std": 0.25231799483299255, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052797794342041, + "sampling/importance_sampling_ratio/min": 3.9022021169898835e-09, + "sampling/sampling_logp_difference/max": 19.361724853515625, + "sampling/sampling_logp_difference/mean": 0.10318418592214584, + "step": 89 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2627365291118622, + "epoch": 0.23684210526315788, + "grad_norm": 0.012172169052064419, + "learning_rate": 1e-06, + "loss": 0.0412, + "step": 90 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2614479660987854, + "epoch": 0.2394736842105263, + "grad_norm": 0.033741675317287445, + "learning_rate": 1e-06, + "loss": 0.1004, + "step": 91 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.26432569324970245, + "epoch": 0.24210526315789474, + "grad_norm": 0.024774247780442238, + "learning_rate": 1e-06, + "loss": 0.0617, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12066.0, + "completions/mean_length": 2051.4296875, + "completions/mean_terminated_length": 1589.088623046875, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "entropy": 0.2490312159061432, + "epoch": 0.24473684210526317, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.035675231367349625, + "learning_rate": 1e-06, + "loss": 0.0635, + "num_tokens": 39296742.0, + "reward": 0.7815226316452026, + "reward_std": 0.21832206845283508, + "rewards/progression_diversity/mean": -0.002041286788880825, + "rewards/progression_diversity/std": 0.023288603872060776, + "rewards/symbolic_reward_accuracy/mean": 0.857421875, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.9000650644302368, + "rewards/symbolic_reward_partial_score/std": 0.2744283676147461, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0516581535339355, + "sampling/importance_sampling_ratio/min": 8.132886725187305e-13, + "sampling/sampling_logp_difference/max": 27.837690353393555, + "sampling/sampling_logp_difference/mean": 0.10091866552829742, + "step": 93 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2668968141078949, + "epoch": 0.24736842105263157, + "grad_norm": 0.012349041178822517, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 94 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.25852371752262115, + "epoch": 0.25, + "grad_norm": 0.030288243666291237, + "learning_rate": 1e-06, + "loss": 0.0817, + "step": 95 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.25272224843502045, + "epoch": 0.25263157894736843, + "grad_norm": 0.037656910717487335, + "learning_rate": 1e-06, + "loss": 0.0626, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12648.0, + "completions/mean_length": 1926.58984375, + "completions/mean_terminated_length": 1638.5936279296875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.2652004808187485, + "epoch": 0.25526315789473686, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.040096741169691086, + "learning_rate": 1e-06, + "loss": 0.0186, + "num_tokens": 40685300.0, + "reward": 0.7350056171417236, + "reward_std": 0.2433113306760788, + "rewards/progression_diversity/mean": -0.00041471776785328984, + "rewards/progression_diversity/std": 0.008273917250335217, + "rewards/symbolic_reward_accuracy/mean": 0.798828125, + "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, + "rewards/symbolic_reward_partial_score/mean": 0.8575846552848816, + "rewards/symbolic_reward_partial_score/std": 0.31621140241622925, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0552291870117188, + "sampling/importance_sampling_ratio/min": 8.425282566342125e-15, + "sampling/sampling_logp_difference/max": 32.40753936767578, + "sampling/sampling_logp_difference/mean": 0.10859895497560501, + "step": 97 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.26686596870422363, + "epoch": 0.2578947368421053, + "grad_norm": 0.014504051767289639, + "learning_rate": 1e-06, + "loss": 0.0306, + "step": 98 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.252104789018631, + "epoch": 0.26052631578947366, + "grad_norm": 0.025143684819340706, + "learning_rate": 1e-06, + "loss": 0.0521, + "step": 99 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.25955578684806824, + "epoch": 0.2631578947368421, + "grad_norm": 0.028664739802479744, + "learning_rate": 1e-06, + "loss": 0.0281, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11806.0, + "completions/mean_length": 2146.349609375, + "completions/mean_terminated_length": 1804.6461181640625, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "entropy": 0.25793255865573883, + "epoch": 0.2657894736842105, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.04812149703502655, + "learning_rate": 1e-06, + "loss": 0.0663, + "num_tokens": 42194055.0, + "reward": 0.7436953783035278, + "reward_std": 0.23294387757778168, + "rewards/progression_diversity/mean": -0.0005821330123580992, + "rewards/progression_diversity/std": 0.009706917218863964, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.8722330331802368, + "rewards/symbolic_reward_partial_score/std": 0.2982480227947235, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0527888536453247, + "sampling/importance_sampling_ratio/min": 3.632110292528523e-06, + "sampling/sampling_logp_difference/max": 12.525696754455566, + "sampling/sampling_logp_difference/mean": 0.10401815176010132, + "step": 101 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2468215674161911, + "epoch": 0.26842105263157895, + "grad_norm": 0.03846008703112602, + "learning_rate": 1e-06, + "loss": 0.0883, + "step": 102 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2601003050804138, + "epoch": 0.2710526315789474, + "grad_norm": 0.017417294904589653, + "learning_rate": 1e-06, + "loss": 0.0133, + "step": 103 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2558285742998123, + "epoch": 0.2736842105263158, + "grad_norm": 0.023298203945159912, + "learning_rate": 1e-06, + "loss": 0.0655, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14029.0, + "completions/mean_length": 1896.59375, + "completions/mean_terminated_length": 1608.0, + "completions/min_length": 379.0, + "completions/min_terminated_length": 379.0, + "entropy": 0.25534459948539734, + "epoch": 0.27631578947368424, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.03245214372873306, + "learning_rate": 1e-06, + "loss": 0.0691, + "num_tokens": 43574071.0, + "reward": 0.7646946907043457, + "reward_std": 0.20825007557868958, + "rewards/progression_diversity/mean": -0.00025633774930611253, + "rewards/progression_diversity/std": 0.0033328270073980093, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.87841796875, + "rewards/symbolic_reward_partial_score/std": 0.30651092529296875, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0528056621551514, + "sampling/importance_sampling_ratio/min": 0.0034063623752444983, + "sampling/sampling_logp_difference/max": 5.68211030960083, + "sampling/sampling_logp_difference/mean": 0.10405570268630981, + "step": 105 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.25604379177093506, + "epoch": 0.2789473684210526, + "grad_norm": 0.01872321590781212, + "learning_rate": 1e-06, + "loss": 0.0386, + "step": 106 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.25383952260017395, + "epoch": 0.28157894736842104, + "grad_norm": 0.019688135012984276, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 107 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.24521780759096146, + "epoch": 0.28421052631578947, + "grad_norm": 0.032386377453804016, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14031.0, + "completions/mean_length": 2017.2890625, + "completions/mean_terminated_length": 1731.099609375, + "completions/min_length": 313.0, + "completions/min_terminated_length": 313.0, + "entropy": 0.2612617313861847, + "epoch": 0.2868421052631579, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.0264718197286129, + "learning_rate": 1e-06, + "loss": 0.0199, + "num_tokens": 45012075.0, + "reward": 0.7735224366188049, + "reward_std": 0.20979472994804382, + "rewards/progression_diversity/mean": -0.0012756988871842623, + "rewards/progression_diversity/std": 0.01783159375190735, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.8876953125, + "rewards/symbolic_reward_partial_score/std": 0.29055485129356384, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0521562099456787, + "sampling/importance_sampling_ratio/min": 5.862594480277039e-05, + "sampling/sampling_logp_difference/max": 9.744333267211914, + "sampling/sampling_logp_difference/mean": 0.10334809869527817, + "step": 109 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.25815196335315704, + "epoch": 0.2894736842105263, + "grad_norm": 0.021397482603788376, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 110 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2469303384423256, + "epoch": 0.29210526315789476, + "grad_norm": 0.028907410800457, + "learning_rate": 1e-06, + "loss": 0.1008, + "step": 111 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.24571603536605835, + "epoch": 0.29473684210526313, + "grad_norm": 0.043796975165605545, + "learning_rate": 1e-06, + "loss": 0.0896, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12841.0, + "completions/mean_length": 1870.52734375, + "completions/mean_terminated_length": 1581.4144287109375, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.2544310688972473, + "epoch": 0.29736842105263156, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.02333790808916092, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 46372409.0, + "reward": 0.7861639857292175, + "reward_std": 0.2023507058620453, + "rewards/progression_diversity/mean": -0.0017700331518426538, + "rewards/progression_diversity/std": 0.019489416852593422, + "rewards/symbolic_reward_accuracy/mean": 0.865234375, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.8953450322151184, + "rewards/symbolic_reward_partial_score/std": 0.28922733664512634, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0533685684204102, + "sampling/importance_sampling_ratio/min": 2.793473868223373e-05, + "sampling/sampling_logp_difference/max": 10.485639572143555, + "sampling/sampling_logp_difference/mean": 0.10594035685062408, + "step": 113 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2571485936641693, + "epoch": 0.3, + "grad_norm": 0.033841848373413086, + "learning_rate": 1e-06, + "loss": 0.0585, + "step": 114 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.24314817041158676, + "epoch": 0.3026315789473684, + "grad_norm": 0.022232208400964737, + "learning_rate": 1e-06, + "loss": 0.1467, + "step": 115 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.26761798560619354, + "epoch": 0.30526315789473685, + "grad_norm": 0.012662280350923538, + "learning_rate": 1e-06, + "loss": 0.0325, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12353.0, + "completions/mean_length": 1801.455078125, + "completions/mean_terminated_length": 1628.53955078125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.24800319969654083, + "epoch": 0.3078947368421053, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.021536562591791153, + "learning_rate": 1e-06, + "loss": 0.0693, + "num_tokens": 47720738.0, + "reward": 0.7720678448677063, + "reward_std": 0.22057949006557465, + "rewards/progression_diversity/mean": -0.0002511652419343591, + "rewards/progression_diversity/std": 0.004545476287603378, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.9055989980697632, + "rewards/symbolic_reward_partial_score/std": 0.2584453523159027, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0540456771850586, + "sampling/importance_sampling_ratio/min": 0.0003275219933129847, + "sampling/sampling_logp_difference/max": 8.023955345153809, + "sampling/sampling_logp_difference/mean": 0.10680307447910309, + "step": 117 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2550651431083679, + "epoch": 0.3105263157894737, + "grad_norm": 0.014003097079694271, + "learning_rate": 1e-06, + "loss": 0.0051, + "step": 118 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25681254267692566, + "epoch": 0.3131578947368421, + "grad_norm": 0.030175212770700455, + "learning_rate": 1e-06, + "loss": 0.0557, + "step": 119 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.25778651237487793, + "epoch": 0.3157894736842105, + "grad_norm": 0.020599860697984695, + "learning_rate": 1e-06, + "loss": 0.01, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11032.0, + "completions/mean_length": 2128.177734375, + "completions/mean_terminated_length": 1786.0380859375, + "completions/min_length": 348.0, + "completions/min_terminated_length": 348.0, + "entropy": 0.26357313990592957, + "epoch": 0.31842105263157894, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.032891228795051575, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 49218685.0, + "reward": 0.7386098504066467, + "reward_std": 0.24482640624046326, + "rewards/progression_diversity/mean": -0.0013221381232142448, + "rewards/progression_diversity/std": 0.017824513837695122, + "rewards/symbolic_reward_accuracy/mean": 0.798828125, + "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, + "rewards/symbolic_reward_partial_score/mean": 0.87158203125, + "rewards/symbolic_reward_partial_score/std": 0.29892396926879883, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052980661392212, + "sampling/importance_sampling_ratio/min": 0.006169522181153297, + "sampling/sampling_logp_difference/max": 5.088133811950684, + "sampling/sampling_logp_difference/mean": 0.10582087934017181, + "step": 121 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.24658922851085663, + "epoch": 0.32105263157894737, + "grad_norm": 0.024154705926775932, + "learning_rate": 1e-06, + "loss": 0.0446, + "step": 122 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.25428229570388794, + "epoch": 0.3236842105263158, + "grad_norm": 0.03276718035340309, + "learning_rate": 1e-06, + "loss": 0.0973, + "step": 123 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2513224929571152, + "epoch": 0.3263157894736842, + "grad_norm": 0.027816835790872574, + "learning_rate": 1e-06, + "loss": 0.0737, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10545.0, + "completions/mean_length": 2134.193359375, + "completions/mean_terminated_length": 1704.11865234375, + "completions/min_length": 341.0, + "completions/min_terminated_length": 341.0, + "entropy": 0.24834615737199783, + "epoch": 0.32894736842105265, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.02625676989555359, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 50700704.0, + "reward": 0.7628857493400574, + "reward_std": 0.21416853368282318, + "rewards/progression_diversity/mean": -0.000490223930682987, + "rewards/progression_diversity/std": 0.006874173413962126, + "rewards/symbolic_reward_accuracy/mean": 0.828125, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.8951822519302368, + "rewards/symbolic_reward_partial_score/std": 0.26842740178108215, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0525569915771484, + "sampling/importance_sampling_ratio/min": 0.0012693606549873948, + "sampling/sampling_logp_difference/max": 6.669241905212402, + "sampling/sampling_logp_difference/mean": 0.10405848920345306, + "step": 125 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2593626528978348, + "epoch": 0.33157894736842103, + "grad_norm": 0.028445186093449593, + "learning_rate": 1e-06, + "loss": 0.0248, + "step": 126 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.25416844338178635, + "epoch": 0.33421052631578946, + "grad_norm": 0.02981475181877613, + "learning_rate": 1e-06, + "loss": 0.0686, + "step": 127 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2521153539419174, + "epoch": 0.3368421052631579, + "grad_norm": 0.03827949985861778, + "learning_rate": 1e-06, + "loss": 0.089, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11825.0, + "completions/mean_length": 2242.904296875, + "completions/mean_terminated_length": 1668.06298828125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.2524839788675308, + "epoch": 0.3394736842105263, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0338742621243, + "learning_rate": 1e-06, + "loss": 0.0749, + "num_tokens": 52248207.0, + "reward": 0.7660582065582275, + "reward_std": 0.2173374444246292, + "rewards/progression_diversity/mean": -0.0006320271058939397, + "rewards/progression_diversity/std": 0.007326250895857811, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.8966470956802368, + "rewards/symbolic_reward_partial_score/std": 0.2703816890716553, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0492503643035889, + "sampling/importance_sampling_ratio/min": 0.0005629548104479909, + "sampling/sampling_logp_difference/max": 7.482311248779297, + "sampling/sampling_logp_difference/mean": 0.09761972725391388, + "step": 129 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.24737834185361862, + "epoch": 0.34210526315789475, + "grad_norm": 0.027043595910072327, + "learning_rate": 1e-06, + "loss": 0.0991, + "step": 130 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.23644034564495087, + "epoch": 0.3447368421052632, + "grad_norm": 0.02878217026591301, + "learning_rate": 1e-06, + "loss": 0.0564, + "step": 131 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.24825213849544525, + "epoch": 0.3473684210526316, + "grad_norm": 0.03835158050060272, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15272.0, + "completions/mean_length": 1937.564453125, + "completions/mean_terminated_length": 1620.377197265625, + "completions/min_length": 346.0, + "completions/min_terminated_length": 346.0, + "entropy": 0.24670010060071945, + "epoch": 0.35, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.022889573127031326, + "learning_rate": 1e-06, + "loss": 0.0667, + "num_tokens": 53642704.0, + "reward": 0.8256836533546448, + "reward_std": 0.16099657118320465, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9384765625, + "rewards/symbolic_reward_partial_score/std": 0.2168099284172058, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0539462566375732, + "sampling/importance_sampling_ratio/min": 8.059991523623466e-05, + "sampling/sampling_logp_difference/max": 9.426012992858887, + "sampling/sampling_logp_difference/mean": 0.1068400889635086, + "step": 133 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25666534900665283, + "epoch": 0.3526315789473684, + "grad_norm": 0.019193602725863457, + "learning_rate": 1e-06, + "loss": 0.0584, + "step": 134 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.25828083604574203, + "epoch": 0.35526315789473684, + "grad_norm": 0.025437770411372185, + "learning_rate": 1e-06, + "loss": 0.0151, + "step": 135 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2562948167324066, + "epoch": 0.35789473684210527, + "grad_norm": 0.026910221204161644, + "learning_rate": 1e-06, + "loss": 0.047, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13533.0, + "completions/mean_length": 2228.55859375, + "completions/mean_terminated_length": 1771.931396484375, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.2502548471093178, + "epoch": 0.3605263157894737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.0259707048535347, + "learning_rate": 1e-06, + "loss": 0.0639, + "num_tokens": 55173518.0, + "reward": 0.7800472974777222, + "reward_std": 0.17662927508354187, + "rewards/progression_diversity/mean": -0.00308664096519351, + "rewards/progression_diversity/std": 0.02825590781867504, + "rewards/symbolic_reward_accuracy/mean": 0.849609375, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.9095051884651184, + "rewards/symbolic_reward_partial_score/std": 0.25963231921195984, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0506607294082642, + "sampling/importance_sampling_ratio/min": 0.00025443226331844926, + "sampling/sampling_logp_difference/max": 8.27647590637207, + "sampling/sampling_logp_difference/mean": 0.10103225708007812, + "step": 137 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.24615327268838882, + "epoch": 0.3631578947368421, + "grad_norm": 0.03407861292362213, + "learning_rate": 1e-06, + "loss": 0.0456, + "step": 138 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.24503520876169205, + "epoch": 0.36578947368421055, + "grad_norm": 0.020230667665600777, + "learning_rate": 1e-06, + "loss": 0.052, + "step": 139 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.2436116561293602, + "epoch": 0.3684210526315789, + "grad_norm": 0.033303920179605484, + "learning_rate": 1e-06, + "loss": 0.0703, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13365.0, + "completions/mean_length": 1732.8828125, + "completions/mean_terminated_length": 1500.325439453125, + "completions/min_length": 370.0, + "completions/min_terminated_length": 370.0, + "entropy": 0.25387296825647354, + "epoch": 0.37105263157894736, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.02063675969839096, + "learning_rate": 1e-06, + "loss": -0.0168, + "num_tokens": 56458706.0, + "reward": 0.8216304779052734, + "reward_std": 0.14440101385116577, + "rewards/progression_diversity/mean": -4.0891380194807425e-05, + "rewards/progression_diversity/std": 0.0009252663003280759, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.9431965947151184, + "rewards/symbolic_reward_partial_score/std": 0.1989695280790329, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0523567199707031, + "sampling/importance_sampling_ratio/min": 1.1014159326805384e-06, + "sampling/sampling_logp_difference/max": 13.718914031982422, + "sampling/sampling_logp_difference/mean": 0.10537827014923096, + "step": 141 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.24923139810562134, + "epoch": 0.3736842105263158, + "grad_norm": 0.02487753890454769, + "learning_rate": 1e-06, + "loss": 0.0127, + "step": 142 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.248472198843956, + "epoch": 0.3763157894736842, + "grad_norm": 0.017968177795410156, + "learning_rate": 1e-06, + "loss": 0.0407, + "step": 143 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.24632388353347778, + "epoch": 0.37894736842105264, + "grad_norm": 0.025321682915091515, + "learning_rate": 1e-06, + "loss": 0.0718, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13227.0, + "completions/mean_length": 2052.263671875, + "completions/mean_terminated_length": 1708.3021240234375, + "completions/min_length": 320.0, + "completions/min_terminated_length": 320.0, + "entropy": 0.2430901676416397, + "epoch": 0.3815789473684211, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.042146675288677216, + "learning_rate": 1e-06, + "loss": 0.069, + "num_tokens": 57902585.0, + "reward": 0.7910541296005249, + "reward_std": 0.1890793740749359, + "rewards/progression_diversity/mean": -0.0010371711105108261, + "rewards/progression_diversity/std": 0.013147015124559402, + "rewards/symbolic_reward_accuracy/mean": 0.86328125, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.9181314706802368, + "rewards/symbolic_reward_partial_score/std": 0.24215349555015564, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0506622791290283, + "sampling/importance_sampling_ratio/min": 3.2121545423535736e-09, + "sampling/sampling_logp_difference/max": 19.556324005126953, + "sampling/sampling_logp_difference/mean": 0.10158580541610718, + "step": 145 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.24240544438362122, + "epoch": 0.38421052631578945, + "grad_norm": 0.023529332131147385, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 146 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2464822679758072, + "epoch": 0.3868421052631579, + "grad_norm": 0.020634565502405167, + "learning_rate": 1e-06, + "loss": 0.0987, + "step": 147 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2451770007610321, + "epoch": 0.3894736842105263, + "grad_norm": 0.030732842162251472, + "learning_rate": 1e-06, + "loss": 0.0308, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9631.0, + "completions/mean_length": 2138.572265625, + "completions/mean_terminated_length": 1679.042236328125, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.2397768646478653, + "epoch": 0.39210526315789473, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.02241184562444687, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 59407358.0, + "reward": 0.7700119018554688, + "reward_std": 0.21146252751350403, + "rewards/progression_diversity/mean": -0.0007633853820152581, + "rewards/progression_diversity/std": 0.009358874522149563, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.9007161855697632, + "rewards/symbolic_reward_partial_score/std": 0.2675977051258087, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0503795146942139, + "sampling/importance_sampling_ratio/min": 1.3879385960535728e-06, + "sampling/sampling_logp_difference/max": 13.487690925598145, + "sampling/sampling_logp_difference/mean": 0.10065832734107971, + "step": 149 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.24372172355651855, + "epoch": 0.39473684210526316, + "grad_norm": 0.034816596657037735, + "learning_rate": 1e-06, + "loss": 0.0347, + "step": 150 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2407890260219574, + "epoch": 0.3973684210526316, + "grad_norm": 0.021488770842552185, + "learning_rate": 1e-06, + "loss": 0.0215, + "step": 151 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2366773709654808, + "epoch": 0.4, + "grad_norm": 0.02249423786997795, + "learning_rate": 1e-06, + "loss": 0.0609, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13010.0, + "completions/mean_length": 2161.275390625, + "completions/mean_terminated_length": 1672.8182373046875, + "completions/min_length": 338.0, + "completions/min_terminated_length": 338.0, + "entropy": 0.23152073472738266, + "epoch": 0.4026315789473684, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.03908664733171463, + "learning_rate": 1e-06, + "loss": 0.0989, + "num_tokens": 60915915.0, + "reward": 0.7834299206733704, + "reward_std": 0.21882712841033936, + "rewards/progression_diversity/mean": -0.0017377887852489948, + "rewards/progression_diversity/std": 0.021848157048225403, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.90185546875, + "rewards/symbolic_reward_partial_score/std": 0.2732160687446594, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0461015701293945, + "sampling/importance_sampling_ratio/min": 5.239376719146094e-07, + "sampling/sampling_logp_difference/max": 14.461893081665039, + "sampling/sampling_logp_difference/mean": 0.09288465231657028, + "step": 153 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2292015701532364, + "epoch": 0.4052631578947368, + "grad_norm": 0.030572297051548958, + "learning_rate": 1e-06, + "loss": 0.1011, + "step": 154 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.23775313049554825, + "epoch": 0.40789473684210525, + "grad_norm": 0.028754258528351784, + "learning_rate": 1e-06, + "loss": 0.0394, + "step": 155 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.24100537598133087, + "epoch": 0.4105263157894737, + "grad_norm": 0.021660154685378075, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13572.0, + "completions/mean_length": 1708.77734375, + "completions/mean_terminated_length": 1534.762939453125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.24858221411705017, + "epoch": 0.4131578947368421, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.022633863613009453, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 62199769.0, + "reward": 0.8241158723831177, + "reward_std": 0.1481999158859253, + "rewards/progression_diversity/mean": -0.0005272195558063686, + "rewards/progression_diversity/std": 0.010804719291627407, + "rewards/symbolic_reward_accuracy/mean": 0.90625, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.9384765625, + "rewards/symbolic_reward_partial_score/std": 0.21358926594257355, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0517460107803345, + "sampling/importance_sampling_ratio/min": 0.0012036137050017715, + "sampling/sampling_logp_difference/max": 6.722426891326904, + "sampling/sampling_logp_difference/mean": 0.10469282418489456, + "step": 157 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2418229877948761, + "epoch": 0.41578947368421054, + "grad_norm": 0.024082506075501442, + "learning_rate": 1e-06, + "loss": 0.0616, + "step": 158 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.24460408091545105, + "epoch": 0.41842105263157897, + "grad_norm": 0.0060430532321333885, + "learning_rate": 1e-06, + "loss": 0.0383, + "step": 159 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.23893345147371292, + "epoch": 0.42105263157894735, + "grad_norm": 0.014973205514252186, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12721.0, + "completions/mean_length": 2082.78125, + "completions/mean_terminated_length": 1768.782470703125, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.23413384705781937, + "epoch": 0.4236842105263158, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.02440241537988186, + "learning_rate": 1e-06, + "loss": 0.0183, + "num_tokens": 63668137.0, + "reward": 0.8117564916610718, + "reward_std": 0.18439695239067078, + "rewards/progression_diversity/mean": -0.0011134764645248652, + "rewards/progression_diversity/std": 0.014692641794681549, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.9278970956802368, + "rewards/symbolic_reward_partial_score/std": 0.23557829856872559, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.048673152923584, + "sampling/importance_sampling_ratio/min": 0.0006473406101576984, + "sampling/sampling_logp_difference/max": 7.34263801574707, + "sampling/sampling_logp_difference/mean": 0.09804990142583847, + "step": 161 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2300802245736122, + "epoch": 0.4263157894736842, + "grad_norm": 0.02143077924847603, + "learning_rate": 1e-06, + "loss": 0.0697, + "step": 162 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.24065203219652176, + "epoch": 0.42894736842105263, + "grad_norm": 0.02908634953200817, + "learning_rate": 1e-06, + "loss": 0.0755, + "step": 163 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.24057897180318832, + "epoch": 0.43157894736842106, + "grad_norm": 0.04149603471159935, + "learning_rate": 1e-06, + "loss": 0.0485, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11887.0, + "completions/mean_length": 2101.234375, + "completions/mean_terminated_length": 1845.6778564453125, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.23806512355804443, + "epoch": 0.4342105263157895, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.03864562138915062, + "learning_rate": 1e-06, + "loss": 0.0451, + "num_tokens": 65144289.0, + "reward": 0.7944764494895935, + "reward_std": 0.19753678143024445, + "rewards/progression_diversity/mean": -0.000595143239479512, + "rewards/progression_diversity/std": 0.010785636492073536, + "rewards/symbolic_reward_accuracy/mean": 0.8671875, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.91845703125, + "rewards/symbolic_reward_partial_score/std": 0.24648991227149963, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049810767173767, + "sampling/importance_sampling_ratio/min": 1.5343101040343754e-05, + "sampling/sampling_logp_difference/max": 11.084844589233398, + "sampling/sampling_logp_difference/mean": 0.10091371834278107, + "step": 165 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.23936307430267334, + "epoch": 0.4368421052631579, + "grad_norm": 0.02704034186899662, + "learning_rate": 1e-06, + "loss": 0.0388, + "step": 166 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2366686388850212, + "epoch": 0.4394736842105263, + "grad_norm": 0.016262901946902275, + "learning_rate": 1e-06, + "loss": 0.0106, + "step": 167 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.22937704622745514, + "epoch": 0.4421052631578947, + "grad_norm": 0.013757162727415562, + "learning_rate": 1e-06, + "loss": 0.0324, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11079.0, + "completions/mean_length": 1793.64453125, + "completions/mean_terminated_length": 1443.47607421875, + "completions/min_length": 332.0, + "completions/min_terminated_length": 332.0, + "entropy": 0.23620514571666718, + "epoch": 0.44473684210526315, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.018099864944815636, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 66446891.0, + "reward": 0.8357792496681213, + "reward_std": 0.14379340410232544, + "rewards/progression_diversity/mean": -0.0011789561249315739, + "rewards/progression_diversity/std": 0.015437182039022446, + "rewards/symbolic_reward_accuracy/mean": 0.923828125, + "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, + "rewards/symbolic_reward_partial_score/mean": 0.94482421875, + "rewards/symbolic_reward_partial_score/std": 0.2143392264842987, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049529790878296, + "sampling/importance_sampling_ratio/min": 0.0015639930497854948, + "sampling/sampling_logp_difference/max": 6.460513114929199, + "sampling/sampling_logp_difference/mean": 0.10102123022079468, + "step": 169 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.24607934057712555, + "epoch": 0.4473684210526316, + "grad_norm": 0.014026135206222534, + "learning_rate": 1e-06, + "loss": 0.0313, + "step": 170 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.23312748968601227, + "epoch": 0.45, + "grad_norm": 0.03011462651193142, + "learning_rate": 1e-06, + "loss": 0.0888, + "step": 171 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.23896265774965286, + "epoch": 0.45263157894736844, + "grad_norm": 0.009486875496804714, + "learning_rate": 1e-06, + "loss": 0.0374, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10726.0, + "completions/mean_length": 1911.1640625, + "completions/mean_terminated_length": 1652.2066650390625, + "completions/min_length": 335.0, + "completions/min_terminated_length": 335.0, + "entropy": 0.23735451698303223, + "epoch": 0.45526315789473687, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.0280209518969059, + "learning_rate": 1e-06, + "loss": 0.0433, + "num_tokens": 67804351.0, + "reward": 0.8081526756286621, + "reward_std": 0.16803589463233948, + "rewards/progression_diversity/mean": -0.00016519335622433573, + "rewards/progression_diversity/std": 0.003562908386811614, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9249674081802368, + "rewards/symbolic_reward_partial_score/std": 0.2344280630350113, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0501618385314941, + "sampling/importance_sampling_ratio/min": 0.0007048218976706266, + "sampling/sampling_logp_difference/max": 7.257565498352051, + "sampling/sampling_logp_difference/mean": 0.10282571613788605, + "step": 173 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2369169294834137, + "epoch": 0.45789473684210524, + "grad_norm": 0.026422906666994095, + "learning_rate": 1e-06, + "loss": 0.0448, + "step": 174 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2406521737575531, + "epoch": 0.4605263157894737, + "grad_norm": 0.028511840850114822, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 175 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.2376062050461769, + "epoch": 0.4631578947368421, + "grad_norm": 0.029201552271842957, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13314.0, + "completions/mean_length": 1935.361328125, + "completions/mean_terminated_length": 1647.5399169921875, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.23343181610107422, + "epoch": 0.46578947368421053, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.036340776830911636, + "learning_rate": 1e-06, + "loss": 0.0636, + "num_tokens": 69200088.0, + "reward": 0.8155167102813721, + "reward_std": 0.1823035329580307, + "rewards/progression_diversity/mean": -0.0010712584480643272, + "rewards/progression_diversity/std": 0.016564983874559402, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.9358723759651184, + "rewards/symbolic_reward_partial_score/std": 0.22121170163154602, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0475256443023682, + "sampling/importance_sampling_ratio/min": 1.0533493565237362e-12, + "sampling/sampling_logp_difference/max": 27.57904624938965, + "sampling/sampling_logp_difference/mean": 0.09736086428165436, + "step": 177 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22760003805160522, + "epoch": 0.46842105263157896, + "grad_norm": 0.034887488931417465, + "learning_rate": 1e-06, + "loss": 0.0751, + "step": 178 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.22835474461317062, + "epoch": 0.4710526315789474, + "grad_norm": 0.02664666809141636, + "learning_rate": 1e-06, + "loss": 0.0702, + "step": 179 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.23141715675592422, + "epoch": 0.47368421052631576, + "grad_norm": 0.02732260897755623, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13435.0, + "completions/mean_length": 2063.080078125, + "completions/mean_terminated_length": 1748.648681640625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.22150472551584244, + "epoch": 0.4763157894736842, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.048945918679237366, + "learning_rate": 1e-06, + "loss": 0.0967, + "num_tokens": 70669697.0, + "reward": 0.7893031239509583, + "reward_std": 0.20550626516342163, + "rewards/progression_diversity/mean": -0.00035783840576186776, + "rewards/progression_diversity/std": 0.008096958510577679, + "rewards/symbolic_reward_accuracy/mean": 0.8671875, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.90380859375, + "rewards/symbolic_reward_partial_score/std": 0.27613478899002075, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0472935438156128, + "sampling/importance_sampling_ratio/min": 0.004802103620022535, + "sampling/sampling_logp_difference/max": 5.338701248168945, + "sampling/sampling_logp_difference/mean": 0.09636872261762619, + "step": 181 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22790049761533737, + "epoch": 0.4789473684210526, + "grad_norm": 0.021670255810022354, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 182 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.22305145859718323, + "epoch": 0.48157894736842105, + "grad_norm": 0.012241641990840435, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 183 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.21923468261957169, + "epoch": 0.4842105263157895, + "grad_norm": 0.029154837131500244, + "learning_rate": 1e-06, + "loss": 0.074, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10743.0, + "completions/mean_length": 1856.03515625, + "completions/mean_terminated_length": 1507.364013671875, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.2165418118238449, + "epoch": 0.4868421052631579, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.04361012578010559, + "learning_rate": 1e-06, + "loss": 0.1, + "num_tokens": 72034643.0, + "reward": 0.8248018026351929, + "reward_std": 0.15697109699249268, + "rewards/progression_diversity/mean": -0.0002908821334131062, + "rewards/progression_diversity/std": 0.00464981934055686, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9518228769302368, + "rewards/symbolic_reward_partial_score/std": 0.18324565887451172, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0449192523956299, + "sampling/importance_sampling_ratio/min": 0.000814249215181917, + "sampling/sampling_logp_difference/max": 7.11324405670166, + "sampling/sampling_logp_difference/mean": 0.09324932098388672, + "step": 185 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.22823286801576614, + "epoch": 0.48947368421052634, + "grad_norm": 0.015330553986132145, + "learning_rate": 1e-06, + "loss": 0.0087, + "step": 186 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.22463513165712357, + "epoch": 0.4921052631578947, + "grad_norm": 0.028646033257246017, + "learning_rate": 1e-06, + "loss": 0.0249, + "step": 187 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.21679264307022095, + "epoch": 0.49473684210526314, + "grad_norm": 0.027475610375404358, + "learning_rate": 1e-06, + "loss": 0.1208, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14325.0, + "completions/mean_length": 1838.8671875, + "completions/mean_terminated_length": 1489.7840576171875, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.2217376008629799, + "epoch": 0.49736842105263157, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.03827635198831558, + "learning_rate": 1e-06, + "loss": 0.0414, + "num_tokens": 73378767.0, + "reward": 0.8262168169021606, + "reward_std": 0.1503533273935318, + "rewards/progression_diversity/mean": -0.00038728650542907417, + "rewards/progression_diversity/std": 0.0051419991068542, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.94091796875, + "rewards/symbolic_reward_partial_score/std": 0.215196430683136, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0445411205291748, + "sampling/importance_sampling_ratio/min": 8.583670023654122e-07, + "sampling/sampling_logp_difference/max": 13.968234062194824, + "sampling/sampling_logp_difference/mean": 0.09397557377815247, + "step": 189 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21815990656614304, + "epoch": 0.5, + "grad_norm": 0.027534402906894684, + "learning_rate": 1e-06, + "loss": 0.0625, + "step": 190 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.21029751747846603, + "epoch": 0.5026315789473684, + "grad_norm": 0.028749065473675728, + "learning_rate": 1e-06, + "loss": 0.0977, + "step": 191 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.21950938552618027, + "epoch": 0.5052631578947369, + "grad_norm": 0.02661977894604206, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12097.0, + "completions/mean_length": 2130.484375, + "completions/mean_terminated_length": 1817.532958984375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "entropy": 0.2030806839466095, + "epoch": 0.5078947368421053, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.025007417425513268, + "learning_rate": 1e-06, + "loss": 0.0233, + "num_tokens": 74868871.0, + "reward": 0.8149902820587158, + "reward_std": 0.18850557506084442, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.92236328125, + "rewards/symbolic_reward_partial_score/std": 0.2531226873397827, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0420916080474854, + "sampling/importance_sampling_ratio/min": 5.281509857013589e-06, + "sampling/sampling_logp_difference/max": 12.151298522949219, + "sampling/sampling_logp_difference/mean": 0.08910438418388367, + "step": 193 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.20039298385381699, + "epoch": 0.5105263157894737, + "grad_norm": 0.019791144877672195, + "learning_rate": 1e-06, + "loss": 0.0733, + "step": 194 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.20205791294574738, + "epoch": 0.5131578947368421, + "grad_norm": 0.03766762465238571, + "learning_rate": 1e-06, + "loss": 0.054, + "step": 195 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.19714680314064026, + "epoch": 0.5157894736842106, + "grad_norm": 0.04690009355545044, + "learning_rate": 1e-06, + "loss": 0.1071, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11150.0, + "completions/mean_length": 1767.6796875, + "completions/mean_terminated_length": 1386.893798828125, + "completions/min_length": 319.0, + "completions/min_terminated_length": 319.0, + "entropy": 0.19719091057777405, + "epoch": 0.5184210526315789, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.031063755974173546, + "learning_rate": 1e-06, + "loss": 0.0426, + "num_tokens": 76177731.0, + "reward": 0.823818027973175, + "reward_std": 0.16009867191314697, + "rewards/progression_diversity/mean": -0.0010085422545671463, + "rewards/progression_diversity/std": 0.010762260295450687, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.9459635019302368, + "rewards/symbolic_reward_partial_score/std": 0.19425591826438904, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0391511917114258, + "sampling/importance_sampling_ratio/min": 0.0001364499912597239, + "sampling/sampling_logp_difference/max": 8.899552345275879, + "sampling/sampling_logp_difference/mean": 0.08513116836547852, + "step": 197 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.19633030891418457, + "epoch": 0.5210526315789473, + "grad_norm": 0.018081026151776314, + "learning_rate": 1e-06, + "loss": 0.0274, + "step": 198 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.19438430666923523, + "epoch": 0.5236842105263158, + "grad_norm": 0.03003780171275139, + "learning_rate": 1e-06, + "loss": 0.0837, + "step": 199 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.1886511892080307, + "epoch": 0.5263157894736842, + "grad_norm": 0.032905541360378265, + "learning_rate": 1e-06, + "loss": 0.0643, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16023.0, + "completions/mean_length": 2301.400390625, + "completions/mean_terminated_length": 1728.9368896484375, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.18248793482780457, + "epoch": 0.5289473684210526, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.04558183625340462, + "learning_rate": 1e-06, + "loss": 0.1227, + "num_tokens": 77773552.0, + "reward": 0.8007115721702576, + "reward_std": 0.21243008971214294, + "rewards/progression_diversity/mean": -0.0020881860982626677, + "rewards/progression_diversity/std": 0.021307511255145073, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.9197590947151184, + "rewards/symbolic_reward_partial_score/std": 0.2490549385547638, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0355485677719116, + "sampling/importance_sampling_ratio/min": 1.0011056567060805e-10, + "sampling/sampling_logp_difference/max": 23.02474594116211, + "sampling/sampling_logp_difference/mean": 0.07838210463523865, + "step": 201 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.17451690137386322, + "epoch": 0.531578947368421, + "grad_norm": 0.04084772244095802, + "learning_rate": 1e-06, + "loss": 0.1141, + "step": 202 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.17165183275938034, + "epoch": 0.5342105263157895, + "grad_norm": 0.031080789864063263, + "learning_rate": 1e-06, + "loss": 0.0879, + "step": 203 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.16927115619182587, + "epoch": 0.5368421052631579, + "grad_norm": 0.016663571819663048, + "learning_rate": 1e-06, + "loss": 0.0779, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16068.0, + "completions/mean_length": 2071.619140625, + "completions/mean_terminated_length": 1550.115478515625, + "completions/min_length": 358.0, + "completions/min_terminated_length": 358.0, + "entropy": 0.16112020611763, + "epoch": 0.5394736842105263, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.018799329176545143, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 79247213.0, + "reward": 0.8091070055961609, + "reward_std": 0.1729590892791748, + "rewards/progression_diversity/mean": -0.002385494764894247, + "rewards/progression_diversity/std": 0.028513550758361816, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9347330331802368, + "rewards/symbolic_reward_partial_score/std": 0.21787308156490326, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0297162532806396, + "sampling/importance_sampling_ratio/min": 1.7676053403192782e-06, + "sampling/sampling_logp_difference/max": 13.245884895324707, + "sampling/sampling_logp_difference/mean": 0.0692635178565979, + "step": 205 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.15220321714878082, + "epoch": 0.5421052631578948, + "grad_norm": 0.037522438913583755, + "learning_rate": 1e-06, + "loss": 0.1625, + "step": 206 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.15934187173843384, + "epoch": 0.5447368421052632, + "grad_norm": 0.013017321936786175, + "learning_rate": 1e-06, + "loss": 0.0563, + "step": 207 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.15468831360340118, + "epoch": 0.5473684210526316, + "grad_norm": 0.009292000904679298, + "learning_rate": 1e-06, + "loss": 0.0932, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9757.0, + "completions/mean_length": 1568.158203125, + "completions/mean_terminated_length": 1212.5780029296875, + "completions/min_length": 291.0, + "completions/min_terminated_length": 291.0, + "entropy": 0.15928836166858673, + "epoch": 0.55, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.027132874354720116, + "learning_rate": 1e-06, + "loss": 0.0402, + "num_tokens": 80432382.0, + "reward": 0.8553647994995117, + "reward_std": 0.11348331719636917, + "rewards/progression_diversity/mean": -0.0006339521496556699, + "rewards/progression_diversity/std": 0.00838653463870287, + "rewards/symbolic_reward_accuracy/mean": 0.9453125, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.9690755009651184, + "rewards/symbolic_reward_partial_score/std": 0.1494486927986145, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0306223630905151, + "sampling/importance_sampling_ratio/min": 7.626142905792221e-05, + "sampling/sampling_logp_difference/max": 9.481343269348145, + "sampling/sampling_logp_difference/mean": 0.07124973833560944, + "step": 209 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.15551680326461792, + "epoch": 0.5526315789473685, + "grad_norm": 0.018515659496188164, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 210 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.15634237229824066, + "epoch": 0.5552631578947368, + "grad_norm": 0.012999890372157097, + "learning_rate": 1e-06, + "loss": 0.0665, + "step": 211 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.15420294553041458, + "epoch": 0.5578947368421052, + "grad_norm": 0.021096454933285713, + "learning_rate": 1e-06, + "loss": 0.131, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15015.0, + "completions/mean_length": 2094.546875, + "completions/mean_terminated_length": 1422.44580078125, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "entropy": 0.1454745978116989, + "epoch": 0.5605263157894737, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.021531764417886734, + "learning_rate": 1e-06, + "loss": 0.0801, + "num_tokens": 81900598.0, + "reward": 0.7925652265548706, + "reward_std": 0.19417259097099304, + "rewards/progression_diversity/mean": -0.0012926449999213219, + "rewards/progression_diversity/std": 0.012499667704105377, + "rewards/symbolic_reward_accuracy/mean": 0.873046875, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.91015625, + "rewards/symbolic_reward_partial_score/std": 0.2634425461292267, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0279793739318848, + "sampling/importance_sampling_ratio/min": 3.422269249670623e-13, + "sampling/sampling_logp_difference/max": 28.70330238342285, + "sampling/sampling_logp_difference/mean": 0.06678377091884613, + "step": 213 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.14702380448579788, + "epoch": 0.5631578947368421, + "grad_norm": 0.020642530173063278, + "learning_rate": 1e-06, + "loss": 0.0657, + "step": 214 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.14872030168771744, + "epoch": 0.5657894736842105, + "grad_norm": 0.016585860401391983, + "learning_rate": 1e-06, + "loss": -0.0113, + "step": 215 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.14099013805389404, + "epoch": 0.5684210526315789, + "grad_norm": 0.03327646851539612, + "learning_rate": 1e-06, + "loss": 0.1437, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12443.0, + "completions/mean_length": 2076.515625, + "completions/mean_terminated_length": 1342.045166015625, + "completions/min_length": 343.0, + "completions/min_terminated_length": 343.0, + "entropy": 0.15217551589012146, + "epoch": 0.5710526315789474, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.030961882323026657, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 83365918.0, + "reward": 0.8457842469215393, + "reward_std": 0.14305046200752258, + "rewards/progression_diversity/mean": -0.0016604659613221884, + "rewards/progression_diversity/std": 0.020232077687978745, + "rewards/symbolic_reward_accuracy/mean": 0.935546875, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.9638671875, + "rewards/symbolic_reward_partial_score/std": 0.15897713601589203, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.025794267654419, + "sampling/importance_sampling_ratio/min": 1.3557089005189482e-05, + "sampling/sampling_logp_difference/max": 11.208600997924805, + "sampling/sampling_logp_difference/mean": 0.06135455146431923, + "step": 217 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.13979589194059372, + "epoch": 0.5736842105263158, + "grad_norm": 0.030766695737838745, + "learning_rate": 1e-06, + "loss": 0.136, + "step": 218 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.1349509060382843, + "epoch": 0.5763157894736842, + "grad_norm": 0.024950722232460976, + "learning_rate": 1e-06, + "loss": 0.1487, + "step": 219 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.13936427235603333, + "epoch": 0.5789473684210527, + "grad_norm": 0.025016427040100098, + "learning_rate": 1e-06, + "loss": 0.1389, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9998.0, + "completions/mean_length": 2004.419921875, + "completions/mean_terminated_length": 1450.2373046875, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.13566848635673523, + "epoch": 0.5815789473684211, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.03230908513069153, + "learning_rate": 1e-06, + "loss": 0.0997, + "num_tokens": 84804853.0, + "reward": 0.8050163984298706, + "reward_std": 0.20277492702007294, + "rewards/progression_diversity/mean": -0.001296035130508244, + "rewards/progression_diversity/std": 0.020735615864396095, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9256185293197632, + "rewards/symbolic_reward_partial_score/std": 0.23007379472255707, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0255374908447266, + "sampling/importance_sampling_ratio/min": 0.00021752847533207387, + "sampling/sampling_logp_difference/max": 8.433180809020996, + "sampling/sampling_logp_difference/mean": 0.06265204399824142, + "step": 221 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.13476034253835678, + "epoch": 0.5842105263157895, + "grad_norm": 0.038641829043626785, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 222 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.13380540907382965, + "epoch": 0.5868421052631579, + "grad_norm": 0.02986939251422882, + "learning_rate": 1e-06, + "loss": 0.0707, + "step": 223 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.13185346126556396, + "epoch": 0.5894736842105263, + "grad_norm": 0.04221319779753685, + "learning_rate": 1e-06, + "loss": 0.0909, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11142.0, + "completions/mean_length": 1651.677734375, + "completions/mean_terminated_length": 1207.0401611328125, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.13264429569244385, + "epoch": 0.5921052631578947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.03359711542725563, + "learning_rate": 1e-06, + "loss": 0.0813, + "num_tokens": 86012272.0, + "reward": 0.8497519493103027, + "reward_std": 0.12195061147212982, + "rewards/progression_diversity/mean": -0.0003971385594923049, + "rewards/progression_diversity/std": 0.007108698599040508, + "rewards/symbolic_reward_accuracy/mean": 0.943359375, + "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, + "rewards/symbolic_reward_partial_score/mean": 0.95556640625, + "rewards/symbolic_reward_partial_score/std": 0.19622980058193207, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.024213194847107, + "sampling/importance_sampling_ratio/min": 1.59300361701753e-05, + "sampling/sampling_logp_difference/max": 11.047304153442383, + "sampling/sampling_logp_difference/mean": 0.06262955069541931, + "step": 225 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.13049791753292084, + "epoch": 0.5947368421052631, + "grad_norm": 0.013412282802164555, + "learning_rate": 1e-06, + "loss": 0.0659, + "step": 226 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.12776117026805878, + "epoch": 0.5973684210526315, + "grad_norm": 0.023844925686717033, + "learning_rate": 1e-06, + "loss": 0.044, + "step": 227 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.12381385266780853, + "epoch": 0.6, + "grad_norm": 0.028511611744761467, + "learning_rate": 1e-06, + "loss": 0.0867, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15721.0, + "completions/mean_length": 1463.47265625, + "completions/mean_terminated_length": 1196.5048828125, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.1172075942158699, + "epoch": 0.6026315789473684, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04271606728434563, + "learning_rate": 1e-06, + "loss": 0.0551, + "num_tokens": 87146690.0, + "reward": 0.8489208221435547, + "reward_std": 0.12613338232040405, + "rewards/progression_diversity/mean": -0.0005037329392507672, + "rewards/progression_diversity/std": 0.008694959804415703, + "rewards/symbolic_reward_accuracy/mean": 0.9375, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.9606119394302368, + "rewards/symbolic_reward_partial_score/std": 0.17724183201789856, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0227758884429932, + "sampling/importance_sampling_ratio/min": 8.189291838789359e-05, + "sampling/sampling_logp_difference/max": 9.4100980758667, + "sampling/sampling_logp_difference/mean": 0.05999467894434929, + "step": 229 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11710315942764282, + "epoch": 0.6052631578947368, + "grad_norm": 0.008632275275886059, + "learning_rate": 1e-06, + "loss": 0.0071, + "step": 230 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.11399340257048607, + "epoch": 0.6078947368421053, + "grad_norm": 0.012347784824669361, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 231 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.11190313473343849, + "epoch": 0.6105263157894737, + "grad_norm": 0.022991470992565155, + "learning_rate": 1e-06, + "loss": 0.0425, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14312.0, + "completions/mean_length": 1782.529296875, + "completions/mean_terminated_length": 1311.5140380859375, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "entropy": 0.10688180476427078, + "epoch": 0.6131578947368421, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.03337698429822922, + "learning_rate": 1e-06, + "loss": 0.0355, + "num_tokens": 88459825.0, + "reward": 0.7974966168403625, + "reward_std": 0.17888271808624268, + "rewards/progression_diversity/mean": -0.00131894217338413, + "rewards/progression_diversity/std": 0.017751460894942284, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9187825322151184, + "rewards/symbolic_reward_partial_score/std": 0.24961021542549133, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0183002948760986, + "sampling/importance_sampling_ratio/min": 5.948247780906968e-05, + "sampling/sampling_logp_difference/max": 9.729828834533691, + "sampling/sampling_logp_difference/mean": 0.05178453028202057, + "step": 233 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.10239839181303978, + "epoch": 0.6157894736842106, + "grad_norm": 0.019929470494389534, + "learning_rate": 1e-06, + "loss": 0.0941, + "step": 234 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.09998245164752007, + "epoch": 0.618421052631579, + "grad_norm": 0.027180753648281097, + "learning_rate": 1e-06, + "loss": 0.0925, + "step": 235 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.10064789652824402, + "epoch": 0.6210526315789474, + "grad_norm": 0.032762862741947174, + "learning_rate": 1e-06, + "loss": 0.0622, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15458.0, + "completions/mean_length": 1760.564453125, + "completions/mean_terminated_length": 1227.726806640625, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "entropy": 0.09621577709913254, + "epoch": 0.6236842105263158, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.03691167011857033, + "learning_rate": 1e-06, + "loss": 0.1223, + "num_tokens": 89768210.0, + "reward": 0.8165925741195679, + "reward_std": 0.1556319147348404, + "rewards/progression_diversity/mean": -0.0008989507332444191, + "rewards/progression_diversity/std": 0.016222195699810982, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9283853769302368, + "rewards/symbolic_reward_partial_score/std": 0.23884880542755127, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0182127952575684, + "sampling/importance_sampling_ratio/min": 0.0003808625042438507, + "sampling/sampling_logp_difference/max": 7.873072147369385, + "sampling/sampling_logp_difference/mean": 0.052828144282102585, + "step": 237 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.10177255794405937, + "epoch": 0.6263157894736842, + "grad_norm": 0.01514112763106823, + "learning_rate": 1e-06, + "loss": 0.0734, + "step": 238 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.10082132369279861, + "epoch": 0.6289473684210526, + "grad_norm": 0.02351570315659046, + "learning_rate": 1e-06, + "loss": 0.0691, + "step": 239 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.10559947416186333, + "epoch": 0.631578947368421, + "grad_norm": 0.012419568374752998, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8674.0, + "completions/mean_length": 1312.580078125, + "completions/mean_terminated_length": 1073.3511962890625, + "completions/min_length": 345.0, + "completions/min_terminated_length": 345.0, + "entropy": 0.10397671908140182, + "epoch": 0.6342105263157894, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03270323947072029, + "learning_rate": 1e-06, + "loss": 0.036, + "num_tokens": 90821019.0, + "reward": 0.8449134826660156, + "reward_std": 0.10425379872322083, + "rewards/progression_diversity/mean": -0.0008408930152654648, + "rewards/progression_diversity/std": 0.01656174287199974, + "rewards/symbolic_reward_accuracy/mean": 0.9296875, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.9615885615348816, + "rewards/symbolic_reward_partial_score/std": 0.16416993737220764, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0181472301483154, + "sampling/importance_sampling_ratio/min": 2.46100570477914e-11, + "sampling/sampling_logp_difference/max": 24.427865982055664, + "sampling/sampling_logp_difference/mean": 0.055217523127794266, + "step": 241 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.10521255061030388, + "epoch": 0.6368421052631579, + "grad_norm": 0.021176166832447052, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 242 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.10355697944760323, + "epoch": 0.6394736842105263, + "grad_norm": 0.015480482950806618, + "learning_rate": 1e-06, + "loss": 0.0404, + "step": 243 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.10299444571137428, + "epoch": 0.6421052631578947, + "grad_norm": 0.046614497900009155, + "learning_rate": 1e-06, + "loss": 0.0686, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14336.0, + "completions/mean_length": 1815.578125, + "completions/mean_terminated_length": 1254.11767578125, + "completions/min_length": 307.0, + "completions/min_terminated_length": 307.0, + "entropy": 0.0950668603181839, + "epoch": 0.6447368421052632, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.03163347393274307, + "learning_rate": 1e-06, + "loss": 0.0472, + "num_tokens": 92168259.0, + "reward": 0.7833421230316162, + "reward_std": 0.1979527473449707, + "rewards/progression_diversity/mean": -0.0007511397707276046, + "rewards/progression_diversity/std": 0.010229557752609253, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.91259765625, + "rewards/symbolic_reward_partial_score/std": 0.2514844834804535, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0157463550567627, + "sampling/importance_sampling_ratio/min": 8.007385726704896e-11, + "sampling/sampling_logp_difference/max": 23.248071670532227, + "sampling/sampling_logp_difference/mean": 0.04827887937426567, + "step": 245 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.09082784876227379, + "epoch": 0.6473684210526316, + "grad_norm": 0.03819683939218521, + "learning_rate": 1e-06, + "loss": 0.0754, + "step": 246 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.09024357795715332, + "epoch": 0.65, + "grad_norm": 0.03529435768723488, + "learning_rate": 1e-06, + "loss": 0.1095, + "step": 247 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.09114789962768555, + "epoch": 0.6526315789473685, + "grad_norm": 0.014682702720165253, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11281.0, + "completions/mean_length": 1623.357421875, + "completions/mean_terminated_length": 1147.2076416015625, + "completions/min_length": 378.0, + "completions/min_terminated_length": 378.0, + "entropy": 0.08885756507515907, + "epoch": 0.6552631578947369, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.04838551953434944, + "learning_rate": 1e-06, + "loss": 0.1366, + "num_tokens": 93418746.0, + "reward": 0.826103687286377, + "reward_std": 0.17412379384040833, + "rewards/progression_diversity/mean": -0.0019385741325095296, + "rewards/progression_diversity/std": 0.022392842918634415, + "rewards/symbolic_reward_accuracy/mean": 0.912109375, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.93994140625, + "rewards/symbolic_reward_partial_score/std": 0.2181263417005539, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0159519910812378, + "sampling/importance_sampling_ratio/min": 4.792552772414638e-06, + "sampling/sampling_logp_difference/max": 12.24844741821289, + "sampling/sampling_logp_difference/mean": 0.049355216324329376, + "step": 249 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.09172747284173965, + "epoch": 0.6578947368421053, + "grad_norm": 0.020697645843029022, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 250 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.09275993704795837, + "epoch": 0.6605263157894737, + "grad_norm": 0.02590845711529255, + "learning_rate": 1e-06, + "loss": 0.0501, + "step": 251 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.09400615096092224, + "epoch": 0.6631578947368421, + "grad_norm": 0.03462434932589531, + "learning_rate": 1e-06, + "loss": 0.0493, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8934.0, + "completions/mean_length": 1809.392578125, + "completions/mean_terminated_length": 1278.333984375, + "completions/min_length": 340.0, + "completions/min_terminated_length": 340.0, + "entropy": 0.09200675785541534, + "epoch": 0.6657894736842105, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.04529382660984993, + "learning_rate": 1e-06, + "loss": 0.0245, + "num_tokens": 94753315.0, + "reward": 0.8259365558624268, + "reward_std": 0.13518205285072327, + "rewards/progression_diversity/mean": -0.004001125227659941, + "rewards/progression_diversity/std": 0.04852549359202385, + "rewards/symbolic_reward_accuracy/mean": 0.916015625, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.9329427480697632, + "rewards/symbolic_reward_partial_score/std": 0.23484937846660614, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0148303508758545, + "sampling/importance_sampling_ratio/min": 1.1414035583356963e-07, + "sampling/sampling_logp_difference/max": 15.98583698272705, + "sampling/sampling_logp_difference/mean": 0.04799918830394745, + "step": 253 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.08636122196912766, + "epoch": 0.6684210526315789, + "grad_norm": 0.039753351360559464, + "learning_rate": 1e-06, + "loss": 0.1328, + "step": 254 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.09036806598305702, + "epoch": 0.6710526315789473, + "grad_norm": 0.02018708921968937, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 255 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.08820921182632446, + "epoch": 0.6736842105263158, + "grad_norm": 0.03188847377896309, + "learning_rate": 1e-06, + "loss": 0.0965, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13585.0, + "completions/mean_length": 1527.927734375, + "completions/mean_terminated_length": 1171.382080078125, + "completions/min_length": 321.0, + "completions/min_terminated_length": 321.0, + "entropy": 0.09194277971982956, + "epoch": 0.6763157894736842, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.04443329945206642, + "learning_rate": 1e-06, + "loss": 0.064, + "num_tokens": 95937918.0, + "reward": 0.8315865397453308, + "reward_std": 0.13644886016845703, + "rewards/progression_diversity/mean": -0.0005308896070346236, + "rewards/progression_diversity/std": 0.0070388540625572205, + "rewards/symbolic_reward_accuracy/mean": 0.9140625, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.95166015625, + "rewards/symbolic_reward_partial_score/std": 0.1864015907049179, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.015270471572876, + "sampling/importance_sampling_ratio/min": 1.5392264726326823e-19, + "sampling/sampling_logp_difference/max": 43.31783676147461, + "sampling/sampling_logp_difference/mean": 0.04934335872530937, + "step": 257 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.08927204459905624, + "epoch": 0.6789473684210526, + "grad_norm": 0.030259989202022552, + "learning_rate": 1e-06, + "loss": 0.0617, + "step": 258 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.0892256572842598, + "epoch": 0.6815789473684211, + "grad_norm": 0.03653491288423538, + "learning_rate": 1e-06, + "loss": 0.0888, + "step": 259 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.08825241401791573, + "epoch": 0.6842105263157895, + "grad_norm": 0.02339651621878147, + "learning_rate": 1e-06, + "loss": 0.046, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10342.0, + "completions/mean_length": 1722.841796875, + "completions/mean_terminated_length": 1188.629638671875, + "completions/min_length": 365.0, + "completions/min_terminated_length": 365.0, + "entropy": 0.09076549112796783, + "epoch": 0.6868421052631579, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.021180791780352592, + "learning_rate": 1e-06, + "loss": 0.0357, + "num_tokens": 97201805.0, + "reward": 0.8398886919021606, + "reward_std": 0.13938692212104797, + "rewards/progression_diversity/mean": -0.00038591912016272545, + "rewards/progression_diversity/std": 0.005922461394220591, + "rewards/symbolic_reward_accuracy/mean": 0.9296875, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.9519857168197632, + "rewards/symbolic_reward_partial_score/std": 0.19745849072933197, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0142853260040283, + "sampling/importance_sampling_ratio/min": 2.9707528881317558e-08, + "sampling/sampling_logp_difference/max": 17.331865310668945, + "sampling/sampling_logp_difference/mean": 0.04651745408773422, + "step": 261 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.09219764545559883, + "epoch": 0.6894736842105263, + "grad_norm": 0.03137506544589996, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 262 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.09567607566714287, + "epoch": 0.6921052631578948, + "grad_norm": 0.022778861224651337, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 263 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.0844137892127037, + "epoch": 0.6947368421052632, + "grad_norm": 0.03902408108115196, + "learning_rate": 1e-06, + "loss": 0.1838, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11656.0, + "completions/mean_length": 1918.83203125, + "completions/mean_terminated_length": 1269.37548828125, + "completions/min_length": 381.0, + "completions/min_terminated_length": 381.0, + "entropy": 0.09079957380890846, + "epoch": 0.6973684210526315, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.03956228867173195, + "learning_rate": 1e-06, + "loss": 0.0201, + "num_tokens": 98594391.0, + "reward": 0.7959408164024353, + "reward_std": 0.18128597736358643, + "rewards/progression_diversity/mean": -0.0006502005271613598, + "rewards/progression_diversity/std": 0.010507218539714813, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.91748046875, + "rewards/symbolic_reward_partial_score/std": 0.25184011459350586, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0138227939605713, + "sampling/importance_sampling_ratio/min": 2.1744256395450634e-10, + "sampling/sampling_logp_difference/max": 22.249086380004883, + "sampling/sampling_logp_difference/mean": 0.04669389873743057, + "step": 265 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.0856584906578064, + "epoch": 0.7, + "grad_norm": 0.04341353103518486, + "learning_rate": 1e-06, + "loss": 0.0789, + "step": 266 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.08574041724205017, + "epoch": 0.7026315789473684, + "grad_norm": 0.04117586463689804, + "learning_rate": 1e-06, + "loss": 0.1279, + "step": 267 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.08641528338193893, + "epoch": 0.7052631578947368, + "grad_norm": 0.023934362456202507, + "learning_rate": 1e-06, + "loss": 0.0486, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6950.0, + "completions/mean_length": 1519.431640625, + "completions/mean_terminated_length": 1039.929443359375, + "completions/min_length": 328.0, + "completions/min_terminated_length": 328.0, + "entropy": 0.09421858936548233, + "epoch": 0.7078947368421052, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.021700512617826462, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 99741620.0, + "reward": 0.8472155332565308, + "reward_std": 0.12852367758750916, + "rewards/progression_diversity/mean": -0.0001352034305455163, + "rewards/progression_diversity/std": 0.0023650997318327427, + "rewards/symbolic_reward_accuracy/mean": 0.939453125, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.95556640625, + "rewards/symbolic_reward_partial_score/std": 0.19245369732379913, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.014095664024353, + "sampling/importance_sampling_ratio/min": 0.0012474657269194722, + "sampling/sampling_logp_difference/max": 6.686641216278076, + "sampling/sampling_logp_difference/mean": 0.047991082072257996, + "step": 269 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.08904735371470451, + "epoch": 0.7105263157894737, + "grad_norm": 0.020569216459989548, + "learning_rate": 1e-06, + "loss": 0.0653, + "step": 270 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.08358699828386307, + "epoch": 0.7131578947368421, + "grad_norm": 0.036929428577423096, + "learning_rate": 1e-06, + "loss": 0.164, + "step": 271 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.08975838124752045, + "epoch": 0.7157894736842105, + "grad_norm": 0.01153595745563507, + "learning_rate": 1e-06, + "loss": 0.0975, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11530.0, + "completions/mean_length": 1798.42578125, + "completions/mean_terminated_length": 1205.5162353515625, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.08779791370034218, + "epoch": 0.718421052631579, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.051671918481588364, + "learning_rate": 1e-06, + "loss": 0.0716, + "num_tokens": 101081902.0, + "reward": 0.8015538454055786, + "reward_std": 0.1920056939125061, + "rewards/progression_diversity/mean": -0.0008653616532683372, + "rewards/progression_diversity/std": 0.011233367957174778, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9192708134651184, + "rewards/symbolic_reward_partial_score/std": 0.24985045194625854, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.012866497039795, + "sampling/importance_sampling_ratio/min": 5.919900445405801e-07, + "sampling/sampling_logp_difference/max": 14.339776039123535, + "sampling/sampling_logp_difference/mean": 0.047424331307411194, + "step": 273 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.08645644038915634, + "epoch": 0.7210526315789474, + "grad_norm": 0.04649584740400314, + "learning_rate": 1e-06, + "loss": 0.0667, + "step": 274 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.08046365156769753, + "epoch": 0.7236842105263158, + "grad_norm": 0.044324759393930435, + "learning_rate": 1e-06, + "loss": 0.1699, + "step": 275 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.08621568232774734, + "epoch": 0.7263157894736842, + "grad_norm": 0.033357467502355576, + "learning_rate": 1e-06, + "loss": 0.0483, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11624.0, + "completions/mean_length": 1809.94140625, + "completions/mean_terminated_length": 1309.418212890625, + "completions/min_length": 373.0, + "completions/min_terminated_length": 373.0, + "entropy": 0.08192634955048561, + "epoch": 0.7289473684210527, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.04480903223156929, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 102443600.0, + "reward": 0.7796283960342407, + "reward_std": 0.20830506086349487, + "rewards/progression_diversity/mean": -0.001035432331264019, + "rewards/progression_diversity/std": 0.010857795365154743, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.9021809697151184, + "rewards/symbolic_reward_partial_score/std": 0.27368080615997314, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0123987197875977, + "sampling/importance_sampling_ratio/min": 1.9622171695829784e-17, + "sampling/sampling_logp_difference/max": 38.469871520996094, + "sampling/sampling_logp_difference/mean": 0.04502936080098152, + "step": 277 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.08124225959181786, + "epoch": 0.7315789473684211, + "grad_norm": 0.03922456502914429, + "learning_rate": 1e-06, + "loss": 0.0856, + "step": 278 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.08088827505707741, + "epoch": 0.7342105263157894, + "grad_norm": 0.029460720717906952, + "learning_rate": 1e-06, + "loss": 0.0527, + "step": 279 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.07565705850720406, + "epoch": 0.7368421052631579, + "grad_norm": 0.03428329899907112, + "learning_rate": 1e-06, + "loss": 0.1428, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6622.0, + "completions/mean_length": 1865.5, + "completions/mean_terminated_length": 1213.64892578125, + "completions/min_length": 366.0, + "completions/min_terminated_length": 366.0, + "entropy": 0.08282288908958435, + "epoch": 0.7394736842105263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.03280191496014595, + "learning_rate": 1e-06, + "loss": 0.048, + "num_tokens": 103822896.0, + "reward": 0.799896240234375, + "reward_std": 0.18378372490406036, + "rewards/progression_diversity/mean": -0.0006127399392426014, + "rewards/progression_diversity/std": 0.0077356030233204365, + "rewards/symbolic_reward_accuracy/mean": 0.876953125, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.9267578125, + "rewards/symbolic_reward_partial_score/std": 0.23403851687908173, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0123448371887207, + "sampling/importance_sampling_ratio/min": 1.5536484170297626e-06, + "sampling/sampling_logp_difference/max": 13.37490463256836, + "sampling/sampling_logp_difference/mean": 0.047334231436252594, + "step": 281 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.08162574470043182, + "epoch": 0.7421052631578947, + "grad_norm": 0.03995591774582863, + "learning_rate": 1e-06, + "loss": 0.1125, + "step": 282 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.08600019663572311, + "epoch": 0.7447368421052631, + "grad_norm": 0.02127450704574585, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 283 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.08134080097079277, + "epoch": 0.7473684210526316, + "grad_norm": 0.020339855924248695, + "learning_rate": 1e-06, + "loss": 0.1129, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16203.0, + "completions/mean_length": 2301.19921875, + "completions/mean_terminated_length": 1362.345947265625, + "completions/min_length": 351.0, + "completions/min_terminated_length": 351.0, + "entropy": 0.07727199792861938, + "epoch": 0.75, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.059844452887773514, + "learning_rate": 1e-06, + "loss": 0.1395, + "num_tokens": 105415446.0, + "reward": 0.7762081623077393, + "reward_std": 0.252108633518219, + "rewards/progression_diversity/mean": -0.001261794357560575, + "rewards/progression_diversity/std": 0.012555805034935474, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.8855794072151184, + "rewards/symbolic_reward_partial_score/std": 0.30380791425704956, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0110630989074707, + "sampling/importance_sampling_ratio/min": 1.2527775652415585e-05, + "sampling/sampling_logp_difference/max": 11.287562370300293, + "sampling/sampling_logp_difference/mean": 0.04290057718753815, + "step": 285 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.07825002819299698, + "epoch": 0.7526315789473684, + "grad_norm": 0.03701354190707207, + "learning_rate": 1e-06, + "loss": 0.0698, + "step": 286 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.07666192576289177, + "epoch": 0.7552631578947369, + "grad_norm": 0.03503553569316864, + "learning_rate": 1e-06, + "loss": 0.1734, + "step": 287 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.07939735427498817, + "epoch": 0.7578947368421053, + "grad_norm": 0.03646966814994812, + "learning_rate": 1e-06, + "loss": 0.0965, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12705.0, + "completions/mean_length": 2036.3125, + "completions/mean_terminated_length": 1392.1304931640625, + "completions/min_length": 357.0, + "completions/min_terminated_length": 357.0, + "entropy": 0.07943989709019661, + "epoch": 0.7605263157894737, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.054009877145290375, + "learning_rate": 1e-06, + "loss": 0.0678, + "num_tokens": 106871190.0, + "reward": 0.76430344581604, + "reward_std": 0.23938891291618347, + "rewards/progression_diversity/mean": -0.0003246946434956044, + "rewards/progression_diversity/std": 0.005887071136385202, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.88232421875, + "rewards/symbolic_reward_partial_score/std": 0.3022898733615875, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0122061967849731, + "sampling/importance_sampling_ratio/min": 4.789482318301452e-06, + "sampling/sampling_logp_difference/max": 12.249088287353516, + "sampling/sampling_logp_difference/mean": 0.046712085604667664, + "step": 289 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.08157060295343399, + "epoch": 0.7631578947368421, + "grad_norm": 0.04651428759098053, + "learning_rate": 1e-06, + "loss": 0.0802, + "step": 290 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.07789277285337448, + "epoch": 0.7657894736842106, + "grad_norm": 0.026174401864409447, + "learning_rate": 1e-06, + "loss": 0.1247, + "step": 291 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.08365096524357796, + "epoch": 0.7684210526315789, + "grad_norm": 0.03342805802822113, + "learning_rate": 1e-06, + "loss": 0.0482, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12578.0, + "completions/mean_length": 1690.671875, + "completions/mean_terminated_length": 1368.0638427734375, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.08383786678314209, + "epoch": 0.7710526315789473, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.051632702350616455, + "learning_rate": 1e-06, + "loss": 0.1011, + "num_tokens": 108138638.0, + "reward": 0.8156664371490479, + "reward_std": 0.16709403693675995, + "rewards/progression_diversity/mean": -0.0007459928747266531, + "rewards/progression_diversity/std": 0.013311400078237057, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.93310546875, + "rewards/symbolic_reward_partial_score/std": 0.22900764644145966, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.013049602508545, + "sampling/importance_sampling_ratio/min": 8.142708793457132e-06, + "sampling/sampling_logp_difference/max": 11.718387603759766, + "sampling/sampling_logp_difference/mean": 0.050904612988233566, + "step": 293 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.0861295573413372, + "epoch": 0.7736842105263158, + "grad_norm": 0.021426010876893997, + "learning_rate": 1e-06, + "loss": 0.0375, + "step": 294 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.08208976686000824, + "epoch": 0.7763157894736842, + "grad_norm": 0.042882923036813736, + "learning_rate": 1e-06, + "loss": 0.0493, + "step": 295 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.08664394915103912, + "epoch": 0.7789473684210526, + "grad_norm": 0.023627059534192085, + "learning_rate": 1e-06, + "loss": 0.024, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11644.0, + "completions/mean_length": 1897.166015625, + "completions/mean_terminated_length": 1338.849853515625, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "entropy": 0.08488818258047104, + "epoch": 0.781578947368421, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.07350046932697296, + "learning_rate": 1e-06, + "loss": 0.098, + "num_tokens": 109518627.0, + "reward": 0.8159432411193848, + "reward_std": 0.1821536421775818, + "rewards/progression_diversity/mean": -0.0023595020174980164, + "rewards/progression_diversity/std": 0.03643770143389702, + "rewards/symbolic_reward_accuracy/mean": 0.90625, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.9197590947151184, + "rewards/symbolic_reward_partial_score/std": 0.26337599754333496, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0125732421875, + "sampling/importance_sampling_ratio/min": 1.475510165438454e-11, + "sampling/sampling_logp_difference/max": 24.93943214416504, + "sampling/sampling_logp_difference/mean": 0.04736366495490074, + "step": 297 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.07962662726640701, + "epoch": 0.7842105263157895, + "grad_norm": 0.038738593459129333, + "learning_rate": 1e-06, + "loss": 0.1205, + "step": 298 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.0848800577223301, + "epoch": 0.7868421052631579, + "grad_norm": 0.015012623742222786, + "learning_rate": 1e-06, + "loss": 0.0471, + "step": 299 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.08305321261286736, + "epoch": 0.7894736842105263, + "grad_norm": 0.014111812226474285, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14519.0, + "completions/mean_length": 1872.60546875, + "completions/mean_terminated_length": 1404.4959716796875, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "entropy": 0.08086536824703217, + "epoch": 0.7921052631578948, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.05553090572357178, + "learning_rate": 1e-06, + "loss": 0.0673, + "num_tokens": 110901241.0, + "reward": 0.8010022640228271, + "reward_std": 0.2108319252729416, + "rewards/progression_diversity/mean": -0.002319404622539878, + "rewards/progression_diversity/std": 0.034735988825559616, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9109700918197632, + "rewards/symbolic_reward_partial_score/std": 0.26965585350990295, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0123342275619507, + "sampling/importance_sampling_ratio/min": 5.6792261458925644e-14, + "sampling/sampling_logp_difference/max": 30.49937629699707, + "sampling/sampling_logp_difference/mean": 0.04701223969459534, + "step": 301 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.08132592588663101, + "epoch": 0.7947368421052632, + "grad_norm": 0.05231938883662224, + "learning_rate": 1e-06, + "loss": 0.0842, + "step": 302 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.0790596604347229, + "epoch": 0.7973684210526316, + "grad_norm": 0.014687119983136654, + "learning_rate": 1e-06, + "loss": 0.1099, + "step": 303 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.08050628378987312, + "epoch": 0.8, + "grad_norm": 0.027776561677455902, + "learning_rate": 1e-06, + "loss": 0.071, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14410.0, + "completions/mean_length": 1649.3046875, + "completions/mean_terminated_length": 1235.0762939453125, + "completions/min_length": 349.0, + "completions/min_terminated_length": 349.0, + "entropy": 0.07759291678667068, + "epoch": 0.8026315789473685, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.05399545282125473, + "learning_rate": 1e-06, + "loss": 0.1019, + "num_tokens": 112154165.0, + "reward": 0.8109291791915894, + "reward_std": 0.18250510096549988, + "rewards/progression_diversity/mean": -0.0008340342901647091, + "rewards/progression_diversity/std": 0.014236577786505222, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.9231770634651184, + "rewards/symbolic_reward_partial_score/std": 0.25037682056427, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0121451616287231, + "sampling/importance_sampling_ratio/min": 2.1538072954964387e-18, + "sampling/sampling_logp_difference/max": 40.67929458618164, + "sampling/sampling_logp_difference/mean": 0.04661684110760689, + "step": 305 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.08159153163433075, + "epoch": 0.8052631578947368, + "grad_norm": 0.047688040882349014, + "learning_rate": 1e-06, + "loss": 0.0852, + "step": 306 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.08121361956000328, + "epoch": 0.8078947368421052, + "grad_norm": 0.017503326758742332, + "learning_rate": 1e-06, + "loss": 0.0402, + "step": 307 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.08150652050971985, + "epoch": 0.8105263157894737, + "grad_norm": 0.0493878610432148, + "learning_rate": 1e-06, + "loss": 0.0819, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9856.0, + "completions/mean_length": 1619.880859375, + "completions/mean_terminated_length": 1143.618896484375, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "entropy": 0.08610192313790321, + "epoch": 0.8131578947368421, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.05817464739084244, + "learning_rate": 1e-06, + "loss": 0.0506, + "num_tokens": 113374680.0, + "reward": 0.8267406821250916, + "reward_std": 0.16926893591880798, + "rewards/progression_diversity/mean": -0.0017180759459733963, + "rewards/progression_diversity/std": 0.03334691748023033, + "rewards/symbolic_reward_accuracy/mean": 0.9140625, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.9381510019302368, + "rewards/symbolic_reward_partial_score/std": 0.22628821432590485, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0124841928482056, + "sampling/importance_sampling_ratio/min": 7.453480520780431e-06, + "sampling/sampling_logp_difference/max": 11.806829452514648, + "sampling/sampling_logp_difference/mean": 0.047006309032440186, + "step": 309 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.08357677236199379, + "epoch": 0.8157894736842105, + "grad_norm": 0.032003868371248245, + "learning_rate": 1e-06, + "loss": 0.0531, + "step": 310 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.08344599604606628, + "epoch": 0.8184210526315789, + "grad_norm": 0.032237276434898376, + "learning_rate": 1e-06, + "loss": 0.0649, + "step": 311 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.08107833191752434, + "epoch": 0.8210526315789474, + "grad_norm": 0.03354015573859215, + "learning_rate": 1e-06, + "loss": 0.0988, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12625.0, + "completions/mean_length": 2280.06640625, + "completions/mean_terminated_length": 1308.396728515625, + "completions/min_length": 354.0, + "completions/min_terminated_length": 354.0, + "entropy": 0.07463454082608223, + "epoch": 0.8236842105263158, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0700058713555336, + "learning_rate": 1e-06, + "loss": 0.0735, + "num_tokens": 114952058.0, + "reward": 0.7640925645828247, + "reward_std": 0.19802439212799072, + "rewards/progression_diversity/mean": -0.0018792236223816872, + "rewards/progression_diversity/std": 0.02314453385770321, + "rewards/symbolic_reward_accuracy/mean": 0.841796875, + "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, + "rewards/symbolic_reward_partial_score/mean": 0.8849283456802368, + "rewards/symbolic_reward_partial_score/std": 0.2975020110607147, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.011326789855957, + "sampling/importance_sampling_ratio/min": 1.1499383845148259e-06, + "sampling/sampling_logp_difference/max": 13.675802230834961, + "sampling/sampling_logp_difference/mean": 0.04442109167575836, + "step": 313 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.0740601234138012, + "epoch": 0.8263157894736842, + "grad_norm": 0.03972548991441727, + "learning_rate": 1e-06, + "loss": 0.0904, + "step": 314 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.0788533091545105, + "epoch": 0.8289473684210527, + "grad_norm": 0.01969255320727825, + "learning_rate": 1e-06, + "loss": 0.0561, + "step": 315 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.0717739462852478, + "epoch": 0.8315789473684211, + "grad_norm": 0.019936300814151764, + "learning_rate": 1e-06, + "loss": 0.1043, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7844.0, + "completions/mean_length": 2019.439453125, + "completions/mean_terminated_length": 1188.4317626953125, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "entropy": 0.07409506663680077, + "epoch": 0.8342105263157895, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.05379168689250946, + "learning_rate": 1e-06, + "loss": 0.1369, + "num_tokens": 116408507.0, + "reward": 0.794788122177124, + "reward_std": 0.20335128903388977, + "rewards/progression_diversity/mean": -0.003609658218920231, + "rewards/progression_diversity/std": 0.039214227348566055, + "rewards/symbolic_reward_accuracy/mean": 0.873046875, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.9215494990348816, + "rewards/symbolic_reward_partial_score/std": 0.24459369480609894, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0107078552246094, + "sampling/importance_sampling_ratio/min": 6.796539331332951e-09, + "sampling/sampling_logp_difference/max": 18.806852340698242, + "sampling/sampling_logp_difference/mean": 0.0424201525747776, + "step": 317 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.07684982568025589, + "epoch": 0.8368421052631579, + "grad_norm": 0.02561601623892784, + "learning_rate": 1e-06, + "loss": 0.0562, + "step": 318 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.07624761387705803, + "epoch": 0.8394736842105263, + "grad_norm": 0.04118689149618149, + "learning_rate": 1e-06, + "loss": 0.0745, + "step": 319 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.07758089527487755, + "epoch": 0.8421052631578947, + "grad_norm": 0.02611648663878441, + "learning_rate": 1e-06, + "loss": 0.084, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14539.0, + "completions/mean_length": 1871.75, + "completions/mean_terminated_length": 1063.8515625, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.0785975344479084, + "epoch": 0.8447368421052631, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.01912563666701317, + "learning_rate": 1e-06, + "loss": 0.0434, + "num_tokens": 117755291.0, + "reward": 0.8071247339248657, + "reward_std": 0.1330493986606598, + "rewards/progression_diversity/mean": -0.005300430115312338, + "rewards/progression_diversity/std": 0.05674071982502937, + "rewards/symbolic_reward_accuracy/mean": 0.888671875, + "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, + "rewards/symbolic_reward_partial_score/mean": 0.9308267831802368, + "rewards/symbolic_reward_partial_score/std": 0.2333908975124359, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0106561183929443, + "sampling/importance_sampling_ratio/min": 9.975841486209447e-09, + "sampling/sampling_logp_difference/max": 18.423099517822266, + "sampling/sampling_logp_difference/mean": 0.045750319957733154, + "step": 321 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.07452228292822838, + "epoch": 0.8473684210526315, + "grad_norm": 0.028663959354162216, + "learning_rate": 1e-06, + "loss": 0.0673, + "step": 322 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.07840859517455101, + "epoch": 0.85, + "grad_norm": 0.01813848502933979, + "learning_rate": 1e-06, + "loss": 0.0938, + "step": 323 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.07799200713634491, + "epoch": 0.8526315789473684, + "grad_norm": 0.015808766707777977, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15622.0, + "completions/mean_length": 1765.408203125, + "completions/mean_terminated_length": 1077.826171875, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.07237191870808601, + "epoch": 0.8552631578947368, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.05738206207752228, + "learning_rate": 1e-06, + "loss": 0.1679, + "num_tokens": 119072172.0, + "reward": 0.8257243633270264, + "reward_std": 0.1704496443271637, + "rewards/progression_diversity/mean": -0.0008076863596215844, + "rewards/progression_diversity/std": 0.008417648263275623, + "rewards/symbolic_reward_accuracy/mean": 0.91796875, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.9314778447151184, + "rewards/symbolic_reward_partial_score/std": 0.24522030353546143, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0103081464767456, + "sampling/importance_sampling_ratio/min": 7.582560135332983e-10, + "sampling/sampling_logp_difference/max": 21.0, + "sampling/sampling_logp_difference/mean": 0.04462500289082527, + "step": 325 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.07543528452515602, + "epoch": 0.8578947368421053, + "grad_norm": 0.022561442106962204, + "learning_rate": 1e-06, + "loss": 0.0468, + "step": 326 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.07968015596270561, + "epoch": 0.8605263157894737, + "grad_norm": 0.03309568762779236, + "learning_rate": 1e-06, + "loss": 0.0492, + "step": 327 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.08061303943395615, + "epoch": 0.8631578947368421, + "grad_norm": 0.017540300264954567, + "learning_rate": 1e-06, + "loss": 0.0731, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8829.0, + "completions/mean_length": 1729.509765625, + "completions/mean_terminated_length": 1133.7987060546875, + "completions/min_length": 327.0, + "completions/min_terminated_length": 327.0, + "entropy": 0.0855712741613388, + "epoch": 0.8657894736842106, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.04006322845816612, + "learning_rate": 1e-06, + "loss": 0.0258, + "num_tokens": 120356817.0, + "reward": 0.7963314652442932, + "reward_std": 0.1821439266204834, + "rewards/progression_diversity/mean": -0.0006411472568288445, + "rewards/progression_diversity/std": 0.013062160462141037, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9181314706802368, + "rewards/symbolic_reward_partial_score/std": 0.24649207293987274, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.011404275894165, + "sampling/importance_sampling_ratio/min": 2.223772499476695e-09, + "sampling/sampling_logp_difference/max": 19.924060821533203, + "sampling/sampling_logp_difference/mean": 0.045835498720407486, + "step": 329 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.07828561961650848, + "epoch": 0.868421052631579, + "grad_norm": 0.032283343374729156, + "learning_rate": 1e-06, + "loss": 0.093, + "step": 330 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.08197357133030891, + "epoch": 0.8710526315789474, + "grad_norm": 0.04965886473655701, + "learning_rate": 1e-06, + "loss": 0.0956, + "step": 331 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.08033063635230064, + "epoch": 0.8736842105263158, + "grad_norm": 0.029129937291145325, + "learning_rate": 1e-06, + "loss": 0.0689, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9487.0, + "completions/mean_length": 1397.1328125, + "completions/mean_terminated_length": 1006.6934204101562, + "completions/min_length": 306.0, + "completions/min_terminated_length": 306.0, + "entropy": 0.08633046597242355, + "epoch": 0.8763157894736842, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.034233320504426956, + "learning_rate": 1e-06, + "loss": 0.0423, + "num_tokens": 121475957.0, + "reward": 0.8381311893463135, + "reward_std": 0.1662929654121399, + "rewards/progression_diversity/mean": -0.0003610485000535846, + "rewards/progression_diversity/std": 0.006844029296189547, + "rewards/symbolic_reward_accuracy/mean": 0.927734375, + "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, + "rewards/symbolic_reward_partial_score/mean": 0.94677734375, + "rewards/symbolic_reward_partial_score/std": 0.2108106017112732, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0122419595718384, + "sampling/importance_sampling_ratio/min": 3.6730430110765155e-06, + "sampling/sampling_logp_difference/max": 12.514490127563477, + "sampling/sampling_logp_difference/mean": 0.047133252024650574, + "step": 333 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.08650662377476692, + "epoch": 0.8789473684210526, + "grad_norm": 0.013222447596490383, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 334 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.08855986595153809, + "epoch": 0.881578947368421, + "grad_norm": 0.010891803540289402, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 335 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.0886671282351017, + "epoch": 0.8842105263157894, + "grad_norm": 0.014351065270602703, + "learning_rate": 1e-06, + "loss": 0.0698, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15569.0, + "completions/mean_length": 1693.0078125, + "completions/mean_terminated_length": 1064.67626953125, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.08585496246814728, + "epoch": 0.8868421052631579, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0539562962949276, + "learning_rate": 1e-06, + "loss": 0.1175, + "num_tokens": 122741049.0, + "reward": 0.8183258771896362, + "reward_std": 0.15719908475875854, + "rewards/progression_diversity/mean": -0.0033535552211105824, + "rewards/progression_diversity/std": 0.04218889772891998, + "rewards/symbolic_reward_accuracy/mean": 0.90625, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.9290364384651184, + "rewards/symbolic_reward_partial_score/std": 0.24505099654197693, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0127696990966797, + "sampling/importance_sampling_ratio/min": 7.183120487752603e-06, + "sampling/sampling_logp_difference/max": 11.84377670288086, + "sampling/sampling_logp_difference/mean": 0.045910563319921494, + "step": 337 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.08782363682985306, + "epoch": 0.8894736842105263, + "grad_norm": 0.026008745655417442, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 338 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.09032351523637772, + "epoch": 0.8921052631578947, + "grad_norm": 0.05031033605337143, + "learning_rate": 1e-06, + "loss": 0.1113, + "step": 339 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.0947401411831379, + "epoch": 0.8947368421052632, + "grad_norm": 0.020250199362635612, + "learning_rate": 1e-06, + "loss": 0.0466, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7096.0, + "completions/mean_length": 1569.30078125, + "completions/mean_terminated_length": 967.0772094726562, + "completions/min_length": 305.0, + "completions/min_terminated_length": 305.0, + "entropy": 0.09787581861019135, + "epoch": 0.8973684210526316, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.03366163745522499, + "learning_rate": 1e-06, + "loss": 0.0513, + "num_tokens": 123942163.0, + "reward": 0.8242777585983276, + "reward_std": 0.17191796004772186, + "rewards/progression_diversity/mean": -0.0038672885857522488, + "rewards/progression_diversity/std": 0.05004805698990822, + "rewards/symbolic_reward_accuracy/mean": 0.9140625, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.9326171875, + "rewards/symbolic_reward_partial_score/std": 0.23826108872890472, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0147292613983154, + "sampling/importance_sampling_ratio/min": 4.3840016587637365e-05, + "sampling/sampling_logp_difference/max": 10.034963607788086, + "sampling/sampling_logp_difference/mean": 0.05117640271782875, + "step": 341 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.09365754574537277, + "epoch": 0.9, + "grad_norm": 0.029882870614528656, + "learning_rate": 1e-06, + "loss": 0.0784, + "step": 342 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.09345110133290291, + "epoch": 0.9026315789473685, + "grad_norm": 0.03513422608375549, + "learning_rate": 1e-06, + "loss": 0.0559, + "step": 343 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.09143765270709991, + "epoch": 0.9052631578947369, + "grad_norm": 0.024734172970056534, + "learning_rate": 1e-06, + "loss": 0.0746, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6936.0, + "completions/mean_length": 1666.32421875, + "completions/mean_terminated_length": 1005.5305786132812, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "entropy": 0.09469415619969368, + "epoch": 0.9078947368421053, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.03798873722553253, + "learning_rate": 1e-06, + "loss": 0.0727, + "num_tokens": 125223993.0, + "reward": 0.810152530670166, + "reward_std": 0.19300349056720734, + "rewards/progression_diversity/mean": -0.00526084192097187, + "rewards/progression_diversity/std": 0.053911034017801285, + "rewards/symbolic_reward_accuracy/mean": 0.8984375, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.9181314706802368, + "rewards/symbolic_reward_partial_score/std": 0.26354485750198364, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0131090879440308, + "sampling/importance_sampling_ratio/min": 1.7146856407634914e-05, + "sampling/sampling_logp_difference/max": 10.973695755004883, + "sampling/sampling_logp_difference/mean": 0.04458559304475784, + "step": 345 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.08893976360559464, + "epoch": 0.9105263157894737, + "grad_norm": 0.03366508707404137, + "learning_rate": 1e-06, + "loss": 0.1022, + "step": 346 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.09565364941954613, + "epoch": 0.9131578947368421, + "grad_norm": 0.01510405819863081, + "learning_rate": 1e-06, + "loss": 0.0209, + "step": 347 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.09264839813113213, + "epoch": 0.9157894736842105, + "grad_norm": 0.015571820549666882, + "learning_rate": 1e-06, + "loss": 0.1362, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8325.0, + "completions/mean_length": 1417.078125, + "completions/mean_terminated_length": 934.274169921875, + "completions/min_length": 290.0, + "completions/min_terminated_length": 290.0, + "entropy": 0.09942467883229256, + "epoch": 0.9184210526315789, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.046768125146627426, + "learning_rate": 1e-06, + "loss": 0.0793, + "num_tokens": 126347329.0, + "reward": 0.8201934099197388, + "reward_std": 0.18495051562786102, + "rewards/progression_diversity/mean": -0.0021503996104002, + "rewards/progression_diversity/std": 0.03275972977280617, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.9280598759651184, + "rewards/symbolic_reward_partial_score/std": 0.24548624455928802, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0158005952835083, + "sampling/importance_sampling_ratio/min": 1.7636549500821275e-06, + "sampling/sampling_logp_difference/max": 13.248122215270996, + "sampling/sampling_logp_difference/mean": 0.05201897770166397, + "step": 349 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.09871027618646622, + "epoch": 0.9210526315789473, + "grad_norm": 0.015367010608315468, + "learning_rate": 1e-06, + "loss": 0.0738, + "step": 350 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.09856873750686646, + "epoch": 0.9236842105263158, + "grad_norm": 0.02123577892780304, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 351 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.09770620614290237, + "epoch": 0.9263157894736842, + "grad_norm": 0.02374398149549961, + "learning_rate": 1e-06, + "loss": 0.072, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12225.0, + "completions/mean_length": 2309.302734375, + "completions/mean_terminated_length": 1084.1168212890625, + "completions/min_length": 315.0, + "completions/min_terminated_length": 315.0, + "entropy": 0.09652888029813766, + "epoch": 0.9289473684210526, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02025141753256321, + "learning_rate": 1e-06, + "loss": 0.0611, + "num_tokens": 127938332.0, + "reward": 0.7528777122497559, + "reward_std": 0.21120837330818176, + "rewards/progression_diversity/mean": -0.0052050938829779625, + "rewards/progression_diversity/std": 0.045792821794748306, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.8645833134651184, + "rewards/symbolic_reward_partial_score/std": 0.32852864265441895, + "rewards/tag_count_reward/mean": -0.080078125, + "rewards/tag_count_reward/std": 0.271679550409317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0146894454956055, + "sampling/importance_sampling_ratio/min": 3.716075696047483e-07, + "sampling/sampling_logp_difference/max": 14.805427551269531, + "sampling/sampling_logp_difference/mean": 0.050460390746593475, + "step": 353 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.09619471430778503, + "epoch": 0.9315789473684211, + "grad_norm": 0.033115409314632416, + "learning_rate": 1e-06, + "loss": 0.0835, + "step": 354 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.09719686955213547, + "epoch": 0.9342105263157895, + "grad_norm": 0.03017679788172245, + "learning_rate": 1e-06, + "loss": 0.111, + "step": 355 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.09859620407223701, + "epoch": 0.9368421052631579, + "grad_norm": 0.02311846613883972, + "learning_rate": 1e-06, + "loss": 0.0881, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14940.0, + "completions/mean_length": 1620.341796875, + "completions/mean_terminated_length": 894.2601928710938, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.10532217100262642, + "epoch": 0.9394736842105263, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.028915587812662125, + "learning_rate": 1e-06, + "loss": 0.039, + "num_tokens": 129171915.0, + "reward": 0.8127701282501221, + "reward_std": 0.16265010833740234, + "rewards/progression_diversity/mean": -0.0022891198750585318, + "rewards/progression_diversity/std": 0.022422684356570244, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9202474355697632, + "rewards/symbolic_reward_partial_score/std": 0.2603859007358551, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0165646076202393, + "sampling/importance_sampling_ratio/min": 5.568129235709263e-17, + "sampling/sampling_logp_difference/max": 37.42688751220703, + "sampling/sampling_logp_difference/mean": 0.053267642855644226, + "step": 357 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.10380205139517784, + "epoch": 0.9421052631578948, + "grad_norm": 0.039858750998973846, + "learning_rate": 1e-06, + "loss": 0.1027, + "step": 358 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.10536672174930573, + "epoch": 0.9447368421052632, + "grad_norm": 0.011492653749883175, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 359 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.10100537538528442, + "epoch": 0.9473684210526315, + "grad_norm": 0.04140660539269447, + "learning_rate": 1e-06, + "loss": 0.0916, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5861.0, + "completions/mean_length": 1051.1953125, + "completions/mean_terminated_length": 869.3834228515625, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.11518026143312454, + "epoch": 0.95, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.029431862756609917, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 130096271.0, + "reward": 0.8326168060302734, + "reward_std": 0.1631331741809845, + "rewards/progression_diversity/mean": -4.066104520461522e-05, + "rewards/progression_diversity/std": 0.000920054386369884, + "rewards/symbolic_reward_accuracy/mean": 0.91796875, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.943359375, + "rewards/symbolic_reward_partial_score/std": 0.21264995634555817, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.019619345664978, + "sampling/importance_sampling_ratio/min": 5.807437264593318e-05, + "sampling/sampling_logp_difference/max": 9.753786087036133, + "sampling/sampling_logp_difference/mean": 0.06247701495885849, + "step": 361 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.11114299297332764, + "epoch": 0.9526315789473684, + "grad_norm": 0.045638490468263626, + "learning_rate": 1e-06, + "loss": 0.058, + "step": 362 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.10937144979834557, + "epoch": 0.9552631578947368, + "grad_norm": 0.013277344405651093, + "learning_rate": 1e-06, + "loss": 0.0497, + "step": 363 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11187266558408737, + "epoch": 0.9578947368421052, + "grad_norm": 0.013971379958093166, + "learning_rate": 1e-06, + "loss": 0.0041, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6367.0, + "completions/mean_length": 1118.857421875, + "completions/mean_terminated_length": 845.7236328125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.10873980447649956, + "epoch": 0.9605263157894737, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.03028622269630432, + "learning_rate": 1e-06, + "loss": 0.0177, + "num_tokens": 131061606.0, + "reward": 0.8580078482627869, + "reward_std": 0.12159307301044464, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.951171875, + "rewards/symbolic_reward_accuracy/std": 0.2157193273305893, + "rewards/symbolic_reward_partial_score/mean": 0.9635416269302368, + "rewards/symbolic_reward_partial_score/std": 0.17610274255275726, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0192532539367676, + "sampling/importance_sampling_ratio/min": 3.5043937196554964e-10, + "sampling/sampling_logp_difference/max": 21.771833419799805, + "sampling/sampling_logp_difference/mean": 0.062029507011175156, + "step": 365 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.11343821510672569, + "epoch": 0.9631578947368421, + "grad_norm": 0.006393097806721926, + "learning_rate": 1e-06, + "loss": 0.0393, + "step": 366 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.10997330024838448, + "epoch": 0.9657894736842105, + "grad_norm": 0.04175656661391258, + "learning_rate": 1e-06, + "loss": 0.0229, + "step": 367 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.11030839011073112, + "epoch": 0.968421052631579, + "grad_norm": 0.009800048545002937, + "learning_rate": 1e-06, + "loss": 0.0279, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8872.0, + "completions/mean_length": 2057.54296875, + "completions/mean_terminated_length": 941.5873413085938, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 0.10226080939173698, + "epoch": 0.9710526315789474, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.0458550862967968, + "learning_rate": 1e-06, + "loss": 0.1033, + "num_tokens": 132521884.0, + "reward": 0.7442119121551514, + "reward_std": 0.21518045663833618, + "rewards/progression_diversity/mean": -0.0026392091531306505, + "rewards/progression_diversity/std": 0.02361941523849964, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.8590494394302368, + "rewards/symbolic_reward_partial_score/std": 0.33246058225631714, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0173622369766235, + "sampling/importance_sampling_ratio/min": 5.933624197496101e-05, + "sampling/sampling_logp_difference/max": 9.732290267944336, + "sampling/sampling_logp_difference/mean": 0.05440949648618698, + "step": 369 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.10158997401595116, + "epoch": 0.9736842105263158, + "grad_norm": 0.038810838013887405, + "learning_rate": 1e-06, + "loss": 0.0925, + "step": 370 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.10655439645051956, + "epoch": 0.9763157894736842, + "grad_norm": 0.019916830584406853, + "learning_rate": 1e-06, + "loss": 0.0455, + "step": 371 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.10742965340614319, + "epoch": 0.9789473684210527, + "grad_norm": 0.01620314083993435, + "learning_rate": 1e-06, + "loss": 0.0595, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7506.0, + "completions/mean_length": 2036.57421875, + "completions/mean_terminated_length": 820.690673828125, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.1056484505534172, + "epoch": 0.9815789473684211, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.028215283527970314, + "learning_rate": 1e-06, + "loss": 0.0996, + "num_tokens": 133962402.0, + "reward": 0.7769756317138672, + "reward_std": 0.14990827441215515, + "rewards/progression_diversity/mean": -0.002632810501381755, + "rewards/progression_diversity/std": 0.029632003977894783, + "rewards/symbolic_reward_accuracy/mean": 0.857421875, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.9005533456802368, + "rewards/symbolic_reward_partial_score/std": 0.28045758605003357, + "rewards/tag_count_reward/mean": -0.076171875, + "rewards/tag_count_reward/std": 0.26553234457969666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0167696475982666, + "sampling/importance_sampling_ratio/min": 2.0920060388868178e-14, + "sampling/sampling_logp_difference/max": 31.49806785583496, + "sampling/sampling_logp_difference/mean": 0.052999142557382584, + "step": 373 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.10324260219931602, + "epoch": 0.9842105263157894, + "grad_norm": 0.03451311215758324, + "learning_rate": 1e-06, + "loss": 0.0725, + "step": 374 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.10772082582116127, + "epoch": 0.9868421052631579, + "grad_norm": 0.01771300472319126, + "learning_rate": 1e-06, + "loss": 0.0278, + "step": 375 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.10192153975367546, + "epoch": 0.9894736842105263, + "grad_norm": 0.05016673728823662, + "learning_rate": 1e-06, + "loss": 0.1221, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14736.0, + "completions/mean_length": 1309.953125, + "completions/mean_terminated_length": 792.2586059570312, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "entropy": 0.11359592527151108, + "epoch": 0.9921052631578947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.02216515503823757, + "learning_rate": 1e-06, + "loss": 0.0448, + "num_tokens": 135040714.0, + "reward": 0.8058879971504211, + "reward_std": 0.18076883256435394, + "rewards/progression_diversity/mean": -0.0020200079306960106, + "rewards/progression_diversity/std": 0.026723647490143776, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.9161783456802368, + "rewards/symbolic_reward_partial_score/std": 0.2642694115638733, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0174283981323242, + "sampling/importance_sampling_ratio/min": 1.079955563909607e-05, + "sampling/sampling_logp_difference/max": 11.436005592346191, + "sampling/sampling_logp_difference/mean": 0.057007431983947754, + "step": 377 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.1086348332464695, + "epoch": 0.9947368421052631, + "grad_norm": 0.022858861833810806, + "learning_rate": 1e-06, + "loss": 0.0659, + "step": 378 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.11348432675004005, + "epoch": 0.9973684210526316, + "grad_norm": 0.01786152645945549, + "learning_rate": 1e-06, + "loss": 0.0745, + "step": 379 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.1098298691213131, + "epoch": 1.0, + "grad_norm": 0.014867125079035759, + "learning_rate": 1e-06, + "loss": 0.118, + "step": 380 + }, + { + "epoch": 1.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.015380859375, + "eval_completions/max_length": 13116.78125, + "eval_completions/max_terminated_length": 3658.3125, + "eval_completions/mean_length": 877.506103515625, + "eval_completions/mean_terminated_length": 635.8593416213989, + "eval_completions/min_length": 274.03125, + "eval_completions/min_terminated_length": 274.03125, + "eval_entropy": 0.11584724555723369, + "eval_frac_reward_zero_std": 0.50390625, + "eval_loss": 0.018385088071227074, + "eval_num_tokens": 135040714.0, + "eval_reward": 0.8390934336930513, + "eval_reward_std": 0.13885579153429717, + "eval_rewards/progression_diversity/mean": -0.0005718442766351473, + "eval_rewards/progression_diversity/std": 0.005817418514197925, + "eval_rewards/symbolic_reward_accuracy/mean": 0.928466796875, + "eval_rewards/symbolic_reward_accuracy/std": 0.24404766922816634, + "eval_rewards/symbolic_reward_partial_score/mean": 0.9451904278248549, + "eval_rewards/symbolic_reward_partial_score/std": 0.19986428710399196, + "eval_rewards/tag_count_reward/mean": -0.015380859375, + "eval_rewards/tag_count_reward/std": 0.10135847562924027, + "eval_runtime": 3216.5627, + "eval_samples_per_second": 0.078, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.020699668675661, + "eval_sampling/importance_sampling_ratio/min": 0.0016229211131763817, + "eval_sampling/sampling_logp_difference/max": 8.347711205482483, + "eval_sampling/sampling_logp_difference/mean": 0.06724433228373528, + "eval_steps_per_second": 0.001, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5424.0, + "completions/mean_length": 1000.6953125, + "completions/mean_terminated_length": 756.5159301757812, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.11362230032682419, + "epoch": 1.0026315789473683, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.04116159304976463, + "learning_rate": 1e-06, + "loss": 0.0408, + "num_tokens": 135975406.0, + "reward": 0.8353489637374878, + "reward_std": 0.15797469019889832, + "rewards/progression_diversity/mean": -0.0002593405661173165, + "rewards/progression_diversity/std": 0.005868207197636366, + "rewards/symbolic_reward_accuracy/mean": 0.921875, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.9459635019302368, + "rewards/symbolic_reward_partial_score/std": 0.20843014121055603, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0185670852661133, + "sampling/importance_sampling_ratio/min": 5.377536217565648e-05, + "sampling/sampling_logp_difference/max": 9.830695152282715, + "sampling/sampling_logp_difference/mean": 0.060036540031433105, + "step": 381 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.11580048501491547, + "epoch": 1.0052631578947369, + "grad_norm": 0.039675887674093246, + "learning_rate": 1e-06, + "loss": 0.0341, + "step": 382 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11673459783196449, + "epoch": 1.0078947368421052, + "grad_norm": 0.022269433364272118, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 383 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.10907693952322006, + "epoch": 1.0105263157894737, + "grad_norm": 0.01744413562119007, + "learning_rate": 1e-06, + "loss": 0.0408, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7503.0, + "completions/mean_length": 1365.107421875, + "completions/mean_terminated_length": 817.8603515625, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.11244688928127289, + "epoch": 1.013157894736842, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.04786130413413048, + "learning_rate": 1e-06, + "loss": 0.0923, + "num_tokens": 137089157.0, + "reward": 0.7905688285827637, + "reward_std": 0.1710084080696106, + "rewards/progression_diversity/mean": -0.0007375068962574005, + "rewards/progression_diversity/std": 0.00985936913639307, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.92431640625, + "rewards/symbolic_reward_partial_score/std": 0.22582882642745972, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0192362070083618, + "sampling/importance_sampling_ratio/min": 2.148686326108873e-05, + "sampling/sampling_logp_difference/max": 10.748068809509277, + "sampling/sampling_logp_difference/mean": 0.05868230015039444, + "step": 385 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.11678306013345718, + "epoch": 1.0157894736842106, + "grad_norm": 0.017300132662057877, + "learning_rate": 1e-06, + "loss": 0.0402, + "step": 386 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.11940395832061768, + "epoch": 1.018421052631579, + "grad_norm": 0.01427427213639021, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 387 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11587749421596527, + "epoch": 1.0210526315789474, + "grad_norm": 0.03753054514527321, + "learning_rate": 1e-06, + "loss": 0.0614, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10616.0, + "completions/mean_length": 1142.22265625, + "completions/mean_terminated_length": 745.1422729492188, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "entropy": 0.1159932017326355, + "epoch": 1.0236842105263158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.03958291932940483, + "learning_rate": 1e-06, + "loss": 0.0994, + "num_tokens": 138065431.0, + "reward": 0.8293313384056091, + "reward_std": 0.15930168330669403, + "rewards/progression_diversity/mean": -0.001442349050194025, + "rewards/progression_diversity/std": 0.016170360147953033, + "rewards/symbolic_reward_accuracy/mean": 0.916015625, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.94091796875, + "rewards/symbolic_reward_partial_score/std": 0.2196962684392929, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0209177732467651, + "sampling/importance_sampling_ratio/min": 8.514979299434344e-07, + "sampling/sampling_logp_difference/max": 13.976268768310547, + "sampling/sampling_logp_difference/mean": 0.06216352805495262, + "step": 389 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.12261833995580673, + "epoch": 1.0263157894736843, + "grad_norm": 0.025751272216439247, + "learning_rate": 1e-06, + "loss": 0.0612, + "step": 390 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.12619221955537796, + "epoch": 1.0289473684210526, + "grad_norm": 0.020393963903188705, + "learning_rate": 1e-06, + "loss": 0.0126, + "step": 391 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.1277061551809311, + "epoch": 1.0315789473684212, + "grad_norm": 0.02457948587834835, + "learning_rate": 1e-06, + "loss": 0.0283, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6283.0, + "completions/mean_length": 1035.68359375, + "completions/mean_terminated_length": 761.0615844726562, + "completions/min_length": 292.0, + "completions/min_terminated_length": 292.0, + "entropy": 0.12381928041577339, + "epoch": 1.0342105263157895, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.0314381942152977, + "learning_rate": 1e-06, + "loss": 0.0236, + "num_tokens": 138999509.0, + "reward": 0.8199691772460938, + "reward_std": 0.13415905833244324, + "rewards/progression_diversity/mean": -0.00015412273933179677, + "rewards/progression_diversity/std": 0.0034873993135988712, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.9461262822151184, + "rewards/symbolic_reward_partial_score/std": 0.19489480555057526, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0213356018066406, + "sampling/importance_sampling_ratio/min": 5.044097406425863e-07, + "sampling/sampling_logp_difference/max": 14.499876976013184, + "sampling/sampling_logp_difference/mean": 0.06386812776327133, + "step": 393 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.11941304057836533, + "epoch": 1.0368421052631578, + "grad_norm": 0.007009011693298817, + "learning_rate": 1e-06, + "loss": 0.0269, + "step": 394 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.1252107173204422, + "epoch": 1.0394736842105263, + "grad_norm": 0.018644485622644424, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 395 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.12022094428539276, + "epoch": 1.0421052631578946, + "grad_norm": 0.008900588378310204, + "learning_rate": 1e-06, + "loss": 0.0657, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5305.0, + "completions/mean_length": 1517.716796875, + "completions/mean_terminated_length": 690.1093139648438, + "completions/min_length": 272.0, + "completions/min_terminated_length": 272.0, + "entropy": 0.11808853596448898, + "epoch": 1.0447368421052632, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.033308856189250946, + "learning_rate": 1e-06, + "loss": 0.0487, + "num_tokens": 140177540.0, + "reward": 0.805396318435669, + "reward_std": 0.14244824647903442, + "rewards/progression_diversity/mean": -0.002365510445088148, + "rewards/progression_diversity/std": 0.032508544623851776, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.9210612177848816, + "rewards/symbolic_reward_partial_score/std": 0.248762309551239, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0201090574264526, + "sampling/importance_sampling_ratio/min": 1.2532452728919452e-06, + "sampling/sampling_logp_difference/max": 13.589774131774902, + "sampling/sampling_logp_difference/mean": 0.05805087089538574, + "step": 397 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11611740663647652, + "epoch": 1.0473684210526315, + "grad_norm": 0.013962333090603352, + "learning_rate": 1e-06, + "loss": 0.0597, + "step": 398 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.1225820817053318, + "epoch": 1.05, + "grad_norm": 0.023140504956245422, + "learning_rate": 1e-06, + "loss": 0.0526, + "step": 399 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.12173304334282875, + "epoch": 1.0526315789473684, + "grad_norm": 0.014247185550630093, + "learning_rate": 1e-06, + "loss": 0.0829, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9002.0, + "completions/mean_length": 1278.06640625, + "completions/mean_terminated_length": 664.0040283203125, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.12073200196027756, + "epoch": 1.055263157894737, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.030312160030007362, + "learning_rate": 1e-06, + "loss": 0.0684, + "num_tokens": 141226214.0, + "reward": 0.8194763660430908, + "reward_std": 0.1385163962841034, + "rewards/progression_diversity/mean": -0.0006138992612250149, + "rewards/progression_diversity/std": 0.008086828514933586, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.93603515625, + "rewards/symbolic_reward_partial_score/std": 0.22104382514953613, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0207290649414062, + "sampling/importance_sampling_ratio/min": 0.00014015565102454275, + "sampling/sampling_logp_difference/max": 8.872756958007812, + "sampling/sampling_logp_difference/mean": 0.06024109199643135, + "step": 401 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.1248621977865696, + "epoch": 1.0578947368421052, + "grad_norm": 0.010959668084979057, + "learning_rate": 1e-06, + "loss": 0.0659, + "step": 402 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.11883337050676346, + "epoch": 1.0605263157894738, + "grad_norm": 0.024657705798745155, + "learning_rate": 1e-06, + "loss": 0.0655, + "step": 403 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.11919617652893066, + "epoch": 1.063157894736842, + "grad_norm": 0.01765652373433113, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7545.0, + "completions/mean_length": 1404.330078125, + "completions/mean_terminated_length": 763.6517944335938, + "completions/min_length": 296.0, + "completions/min_terminated_length": 296.0, + "entropy": 0.11879817768931389, + "epoch": 1.0657894736842106, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.012826770544052124, + "learning_rate": 1e-06, + "loss": 0.0739, + "num_tokens": 142359887.0, + "reward": 0.8211853504180908, + "reward_std": 0.14381209015846252, + "rewards/progression_diversity/mean": -0.000610048184171319, + "rewards/progression_diversity/std": 0.00695717241615057, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.9345703125, + "rewards/symbolic_reward_partial_score/std": 0.22934241592884064, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0197585821151733, + "sampling/importance_sampling_ratio/min": 4.271742568562331e-07, + "sampling/sampling_logp_difference/max": 14.6660737991333, + "sampling/sampling_logp_difference/mean": 0.058111920952796936, + "step": 405 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.112503781914711, + "epoch": 1.068421052631579, + "grad_norm": 0.022235997021198273, + "learning_rate": 1e-06, + "loss": 0.0677, + "step": 406 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11570753157138824, + "epoch": 1.0710526315789473, + "grad_norm": 0.01149500161409378, + "learning_rate": 1e-06, + "loss": 0.0595, + "step": 407 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.11613816022872925, + "epoch": 1.0736842105263158, + "grad_norm": 0.015948958694934845, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4484.0, + "completions/mean_length": 1437.271484375, + "completions/mean_terminated_length": 734.255615234375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.11242419108748436, + "epoch": 1.0763157894736841, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.037394747138023376, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 143501082.0, + "reward": 0.8034499883651733, + "reward_std": 0.14993996918201447, + "rewards/progression_diversity/mean": -0.0016794700641185045, + "rewards/progression_diversity/std": 0.01779768615961075, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9236653447151184, + "rewards/symbolic_reward_partial_score/std": 0.24423635005950928, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0196541547775269, + "sampling/importance_sampling_ratio/min": 5.551847425522283e-05, + "sampling/sampling_logp_difference/max": 9.798794746398926, + "sampling/sampling_logp_difference/mean": 0.05961894989013672, + "step": 409 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.10686653479933739, + "epoch": 1.0789473684210527, + "grad_norm": 0.03459122031927109, + "learning_rate": 1e-06, + "loss": 0.1115, + "step": 410 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.10253704711794853, + "epoch": 1.081578947368421, + "grad_norm": 0.017048373818397522, + "learning_rate": 1e-06, + "loss": 0.0259, + "step": 411 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.0990813598036766, + "epoch": 1.0842105263157895, + "grad_norm": 0.03026195615530014, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4737.0, + "completions/mean_length": 1033.18359375, + "completions/mean_terminated_length": 664.7640380859375, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.10128960758447647, + "epoch": 1.0868421052631578, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.029587438330054283, + "learning_rate": 1e-06, + "loss": 0.0237, + "num_tokens": 144440376.0, + "reward": 0.8033533096313477, + "reward_std": 0.17567187547683716, + "rewards/progression_diversity/mean": -0.0015859488630667329, + "rewards/progression_diversity/std": 0.02551618218421936, + "rewards/symbolic_reward_accuracy/mean": 0.876953125, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.9318033456802368, + "rewards/symbolic_reward_partial_score/std": 0.2252691388130188, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0161960124969482, + "sampling/importance_sampling_ratio/min": 1.3736066648561973e-06, + "sampling/sampling_logp_difference/max": 13.49807071685791, + "sampling/sampling_logp_difference/mean": 0.05791211128234863, + "step": 413 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.09996145218610764, + "epoch": 1.0894736842105264, + "grad_norm": 0.013834556564688683, + "learning_rate": 1e-06, + "loss": 0.0171, + "step": 414 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.10281230881810188, + "epoch": 1.0921052631578947, + "grad_norm": 0.02543732523918152, + "learning_rate": 1e-06, + "loss": 0.0327, + "step": 415 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.09636327251791954, + "epoch": 1.0947368421052632, + "grad_norm": 0.008221838623285294, + "learning_rate": 1e-06, + "loss": 0.0774, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5506.0, + "completions/mean_length": 788.818359375, + "completions/mean_terminated_length": 666.0216674804688, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.1014927513897419, + "epoch": 1.0973684210526315, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01914730854332447, + "learning_rate": 1e-06, + "loss": 0.0277, + "num_tokens": 145242203.0, + "reward": 0.8517071008682251, + "reward_std": 0.12221311032772064, + "rewards/progression_diversity/mean": -0.00019061629427596927, + "rewards/progression_diversity/std": 0.004313154611736536, + "rewards/symbolic_reward_accuracy/mean": 0.9375, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.9666340947151184, + "rewards/symbolic_reward_partial_score/std": 0.15885646641254425, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.017521858215332, + "sampling/importance_sampling_ratio/min": 4.664206699089846e-06, + "sampling/sampling_logp_difference/max": 12.275592803955078, + "sampling/sampling_logp_difference/mean": 0.060040805488824844, + "step": 417 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.10259632021188736, + "epoch": 1.1, + "grad_norm": 0.03230629488825798, + "learning_rate": 1e-06, + "loss": 0.0411, + "step": 418 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.10374432802200317, + "epoch": 1.1026315789473684, + "grad_norm": 0.029180046170949936, + "learning_rate": 1e-06, + "loss": 0.0381, + "step": 419 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.10562139376997948, + "epoch": 1.1052631578947367, + "grad_norm": 0.005970039404928684, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7054.0, + "completions/mean_length": 1033.38671875, + "completions/mean_terminated_length": 727.59765625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.10337032377719879, + "epoch": 1.1078947368421053, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.016906937584280968, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 146154753.0, + "reward": 0.8127256631851196, + "reward_std": 0.16144107282161713, + "rewards/progression_diversity/mean": -0.0018486212939023972, + "rewards/progression_diversity/std": 0.030813097953796387, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.9226887822151184, + "rewards/symbolic_reward_partial_score/std": 0.25257766246795654, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0161592960357666, + "sampling/importance_sampling_ratio/min": 5.162914021639153e-06, + "sampling/sampling_logp_difference/max": 12.174009323120117, + "sampling/sampling_logp_difference/mean": 0.05756688863039017, + "step": 421 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.09895607084035873, + "epoch": 1.1105263157894736, + "grad_norm": 0.04962952807545662, + "learning_rate": 1e-06, + "loss": 0.0424, + "step": 422 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.10057874768972397, + "epoch": 1.1131578947368421, + "grad_norm": 0.015979913994669914, + "learning_rate": 1e-06, + "loss": 0.0597, + "step": 423 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.10110774263739586, + "epoch": 1.1157894736842104, + "grad_norm": 0.0167376846075058, + "learning_rate": 1e-06, + "loss": 0.0414, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5384.0, + "completions/mean_length": 802.5234375, + "completions/mean_terminated_length": 741.419677734375, + "completions/min_length": 278.0, + "completions/min_terminated_length": 278.0, + "entropy": 0.09795539081096649, + "epoch": 1.118421052631579, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.01597507670521736, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 146970797.0, + "reward": 0.8327491879463196, + "reward_std": 0.15943855047225952, + "rewards/progression_diversity/mean": -0.0014538828982040286, + "rewards/progression_diversity/std": 0.03289761394262314, + "rewards/symbolic_reward_accuracy/mean": 0.912109375, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.9529622197151184, + "rewards/symbolic_reward_partial_score/std": 0.18036773800849915, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0156102180480957, + "sampling/importance_sampling_ratio/min": 2.9369621188379824e-05, + "sampling/sampling_logp_difference/max": 10.43554973602295, + "sampling/sampling_logp_difference/mean": 0.06037828326225281, + "step": 425 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.09897046536207199, + "epoch": 1.1210526315789473, + "grad_norm": 0.034385696053504944, + "learning_rate": 1e-06, + "loss": 0.0032, + "step": 426 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.09649962186813354, + "epoch": 1.1236842105263158, + "grad_norm": 0.01485302671790123, + "learning_rate": 1e-06, + "loss": 0.0068, + "step": 427 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.09620106220245361, + "epoch": 1.1263157894736842, + "grad_norm": 0.019307559356093407, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7232.0, + "completions/mean_length": 914.90234375, + "completions/mean_terminated_length": 731.474365234375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.09814492240548134, + "epoch": 1.1289473684210527, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.021051058545708656, + "learning_rate": 1e-06, + "loss": 0.0122, + "num_tokens": 147822843.0, + "reward": 0.8289062976837158, + "reward_std": 0.11690939962863922, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9466145634651184, + "rewards/symbolic_reward_partial_score/std": 0.19831162691116333, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0156972408294678, + "sampling/importance_sampling_ratio/min": 4.255066414771136e-06, + "sampling/sampling_logp_difference/max": 12.367400169372559, + "sampling/sampling_logp_difference/mean": 0.06155911460518837, + "step": 429 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.10168719664216042, + "epoch": 1.131578947368421, + "grad_norm": 0.02653714269399643, + "learning_rate": 1e-06, + "loss": 0.0208, + "step": 430 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.09842254221439362, + "epoch": 1.1342105263157896, + "grad_norm": 0.010419169440865517, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 431 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.09445172548294067, + "epoch": 1.1368421052631579, + "grad_norm": 0.016496408730745316, + "learning_rate": 1e-06, + "loss": 0.0163, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4974.0, + "completions/mean_length": 910.11328125, + "completions/mean_terminated_length": 695.623779296875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.09750872850418091, + "epoch": 1.1394736842105262, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.01833958737552166, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 148674101.0, + "reward": 0.8388184309005737, + "reward_std": 0.0944415032863617, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.919921875, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.9607747197151184, + "rewards/symbolic_reward_partial_score/std": 0.16794845461845398, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0153504610061646, + "sampling/importance_sampling_ratio/min": 1.6851483541913126e-09, + "sampling/sampling_logp_difference/max": 20.201412200927734, + "sampling/sampling_logp_difference/mean": 0.05861156806349754, + "step": 433 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.09329849109053612, + "epoch": 1.1421052631578947, + "grad_norm": 0.045422203838825226, + "learning_rate": 1e-06, + "loss": 0.0453, + "step": 434 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.09762432053685188, + "epoch": 1.1447368421052633, + "grad_norm": 0.014451306313276291, + "learning_rate": 1e-06, + "loss": 0.0188, + "step": 435 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.09433042258024216, + "epoch": 1.1473684210526316, + "grad_norm": 0.01592213101685047, + "learning_rate": 1e-06, + "loss": 0.0077, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5906.0, + "completions/mean_length": 745.5859375, + "completions/mean_terminated_length": 684.2588500976562, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "entropy": 0.09743008762598038, + "epoch": 1.15, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.027467206120491028, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 149445473.0, + "reward": 0.8571289777755737, + "reward_std": 0.09262826293706894, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.9453125, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.9677734375, + "rewards/symbolic_reward_partial_score/std": 0.1515229344367981, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0158276557922363, + "sampling/importance_sampling_ratio/min": 4.494921683228018e-16, + "sampling/sampling_logp_difference/max": 35.33841323852539, + "sampling/sampling_logp_difference/mean": 0.06052025035023689, + "step": 437 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.09928737580776215, + "epoch": 1.1526315789473685, + "grad_norm": 0.026970867067575455, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 438 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.09795548766851425, + "epoch": 1.1552631578947368, + "grad_norm": 0.014579696580767632, + "learning_rate": 1e-06, + "loss": 0.0073, + "step": 439 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.10043598338961601, + "epoch": 1.1578947368421053, + "grad_norm": 0.00749570457264781, + "learning_rate": 1e-06, + "loss": 0.019, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5394.0, + "completions/mean_length": 1239.98828125, + "completions/mean_terminated_length": 845.4548950195312, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.0946054458618164, + "epoch": 1.1605263157894736, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.030099380761384964, + "learning_rate": 1e-06, + "loss": 0.0194, + "num_tokens": 150507195.0, + "reward": 0.7865234613418579, + "reward_std": 0.19104072451591492, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.9186197519302368, + "rewards/symbolic_reward_partial_score/std": 0.2408839911222458, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0148797035217285, + "sampling/importance_sampling_ratio/min": 3.064930638174701e-07, + "sampling/sampling_logp_difference/max": 14.99807071685791, + "sampling/sampling_logp_difference/mean": 0.05635258927941322, + "step": 441 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.0940646082162857, + "epoch": 1.1631578947368422, + "grad_norm": 0.03301357850432396, + "learning_rate": 1e-06, + "loss": 0.0389, + "step": 442 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.09547659382224083, + "epoch": 1.1657894736842105, + "grad_norm": 0.023627353832125664, + "learning_rate": 1e-06, + "loss": 0.0095, + "step": 443 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.0967930480837822, + "epoch": 1.168421052631579, + "grad_norm": 0.028321361169219017, + "learning_rate": 1e-06, + "loss": 0.0502, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9650.0, + "completions/mean_length": 1090.6171875, + "completions/mean_terminated_length": 878.6297607421875, + "completions/min_length": 287.0, + "completions/min_terminated_length": 287.0, + "entropy": 0.10013857111334801, + "epoch": 1.1710526315789473, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.04974789917469025, + "learning_rate": 1e-06, + "loss": 0.0319, + "num_tokens": 151460055.0, + "reward": 0.800537109375, + "reward_std": 0.16888116300106049, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.873046875, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.92626953125, + "rewards/symbolic_reward_partial_score/std": 0.22773192822933197, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.016984224319458, + "sampling/importance_sampling_ratio/min": 2.9851989324924944e-07, + "sampling/sampling_logp_difference/max": 15.024429321289062, + "sampling/sampling_logp_difference/mean": 0.059770140796899796, + "step": 445 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.09959771484136581, + "epoch": 1.1736842105263159, + "grad_norm": 0.017701053991913795, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 446 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.10232729837298393, + "epoch": 1.1763157894736842, + "grad_norm": 0.02004496566951275, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 447 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.10072638839483261, + "epoch": 1.1789473684210527, + "grad_norm": 0.018607337027788162, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6300.0, + "completions/mean_length": 931.9296875, + "completions/mean_terminated_length": 840.8566284179688, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.10329998284578323, + "epoch": 1.181578947368421, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.039218734949827194, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 152344499.0, + "reward": 0.8318839073181152, + "reward_std": 0.14675593376159668, + "rewards/progression_diversity/mean": -8.955165685620159e-05, + "rewards/progression_diversity/std": 0.002026322763413191, + "rewards/symbolic_reward_accuracy/mean": 0.916015625, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.94287109375, + "rewards/symbolic_reward_partial_score/std": 0.2136351764202118, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0176771879196167, + "sampling/importance_sampling_ratio/min": 6.2567501117882784e-06, + "sampling/sampling_logp_difference/max": 11.981849670410156, + "sampling/sampling_logp_difference/mean": 0.06110313534736633, + "step": 449 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.10243717581033707, + "epoch": 1.1842105263157894, + "grad_norm": 0.02117745950818062, + "learning_rate": 1e-06, + "loss": 0.021, + "step": 450 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.10411274060606956, + "epoch": 1.186842105263158, + "grad_norm": 0.02108968421816826, + "learning_rate": 1e-06, + "loss": 0.0114, + "step": 451 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.10935872420668602, + "epoch": 1.1894736842105262, + "grad_norm": 0.007527265697717667, + "learning_rate": 1e-06, + "loss": 0.0099, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7876.0, + "completions/max_terminated_length": 7876.0, + "completions/mean_length": 748.26171875, + "completions/mean_terminated_length": 748.26171875, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.10911908373236656, + "epoch": 1.1921052631578948, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.029693739488720894, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 153135929.0, + "reward": 0.8560059070587158, + "reward_std": 0.12228970229625702, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.943359375, + "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, + "rewards/symbolic_reward_partial_score/mean": 0.9666340947151184, + "rewards/symbolic_reward_partial_score/std": 0.15885646641254425, + "rewards/tag_count_reward/mean": 0.0, + "rewards/tag_count_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0188539028167725, + "sampling/importance_sampling_ratio/min": 1.5778656234033406e-05, + "sampling/sampling_logp_difference/max": 11.056852340698242, + "sampling/sampling_logp_difference/mean": 0.06543545424938202, + "step": 453 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.11097723618149757, + "epoch": 1.194736842105263, + "grad_norm": 0.019065558910369873, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 454 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.11221178621053696, + "epoch": 1.1973684210526316, + "grad_norm": 0.019534561783075333, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 455 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.10779605805873871, + "epoch": 1.2, + "grad_norm": 0.009235396981239319, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6720.0, + "completions/mean_length": 898.009765625, + "completions/mean_terminated_length": 745.2879638671875, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.11065573990345001, + "epoch": 1.2026315789473685, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.012675322592258453, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 154012350.0, + "reward": 0.8206509947776794, + "reward_std": 0.14962440729141235, + "rewards/progression_diversity/mean": -0.00033113209065049887, + "rewards/progression_diversity/std": 0.0074926638044416904, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.94580078125, + "rewards/symbolic_reward_partial_score/std": 0.1969550997018814, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0194942951202393, + "sampling/importance_sampling_ratio/min": 1.5589863266995962e-07, + "sampling/sampling_logp_difference/max": 15.674059867858887, + "sampling/sampling_logp_difference/mean": 0.061966508626937866, + "step": 457 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.10747477412223816, + "epoch": 1.2052631578947368, + "grad_norm": 0.023037495091557503, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 458 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.10901957005262375, + "epoch": 1.2078947368421054, + "grad_norm": 0.032999638468027115, + "learning_rate": 1e-06, + "loss": 0.039, + "step": 459 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.1066613681614399, + "epoch": 1.2105263157894737, + "grad_norm": 0.0317135825753212, + "learning_rate": 1e-06, + "loss": 0.0266, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11462.0, + "completions/mean_length": 1336.01953125, + "completions/mean_terminated_length": 912.98388671875, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.11170916259288788, + "epoch": 1.2131578947368422, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.027878640219569206, + "learning_rate": 1e-06, + "loss": 0.0273, + "num_tokens": 155114376.0, + "reward": 0.7967281341552734, + "reward_std": 0.12850847840309143, + "rewards/progression_diversity/mean": -4.386279761092737e-05, + "rewards/progression_diversity/std": 0.0009925017366185784, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9148762822151184, + "rewards/symbolic_reward_partial_score/std": 0.25462573766708374, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0198333263397217, + "sampling/importance_sampling_ratio/min": 3.876455139106838e-06, + "sampling/sampling_logp_difference/max": 12.460589408874512, + "sampling/sampling_logp_difference/mean": 0.06402070820331573, + "step": 461 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.11339271068572998, + "epoch": 1.2157894736842105, + "grad_norm": 0.02045866660773754, + "learning_rate": 1e-06, + "loss": 0.0186, + "step": 462 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.11221589148044586, + "epoch": 1.2184210526315788, + "grad_norm": 0.011421293020248413, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 463 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.11195248365402222, + "epoch": 1.2210526315789474, + "grad_norm": 0.016839897260069847, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7398.0, + "completions/mean_length": 822.09375, + "completions/mean_terminated_length": 761.0667114257812, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.11619088798761368, + "epoch": 1.2236842105263157, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.045058924704790115, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 155939256.0, + "reward": 0.8314453363418579, + "reward_std": 0.14568641781806946, + "rewards/progression_diversity/mean": 0.0, + "rewards/progression_diversity/std": 0.0, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9524739384651184, + "rewards/symbolic_reward_partial_score/std": 0.1864290088415146, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0201449394226074, + "sampling/importance_sampling_ratio/min": 7.38695504765019e-08, + "sampling/sampling_logp_difference/max": 16.42096519470215, + "sampling/sampling_logp_difference/mean": 0.06559212505817413, + "step": 465 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.11323034018278122, + "epoch": 1.2263157894736842, + "grad_norm": 0.018060827627778053, + "learning_rate": 1e-06, + "loss": 0.0557, + "step": 466 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.11986911296844482, + "epoch": 1.2289473684210526, + "grad_norm": 0.006362342741340399, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 467 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.12106319144368172, + "epoch": 1.231578947368421, + "grad_norm": 0.0218115895986557, + "learning_rate": 1e-06, + "loss": 0.0058, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12412.0, + "completions/mean_length": 1080.7890625, + "completions/mean_terminated_length": 899.328125, + "completions/min_length": 309.0, + "completions/min_terminated_length": 309.0, + "entropy": 0.12077518552541733, + "epoch": 1.2342105263157894, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.023888949304819107, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 156925804.0, + "reward": 0.8068230152130127, + "reward_std": 0.18419486284255981, + "rewards/progression_diversity/mean": -0.0012958223232999444, + "rewards/progression_diversity/std": 0.029321111738681793, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9231771230697632, + "rewards/symbolic_reward_partial_score/std": 0.24108576774597168, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0196768045425415, + "sampling/importance_sampling_ratio/min": 8.88462352577335e-08, + "sampling/sampling_logp_difference/max": 16.236358642578125, + "sampling/sampling_logp_difference/mean": 0.0635080635547638, + "step": 469 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.11772556602954865, + "epoch": 1.236842105263158, + "grad_norm": 0.019387010484933853, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 470 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.11631882935762405, + "epoch": 1.2394736842105263, + "grad_norm": 0.02116510644555092, + "learning_rate": 1e-06, + "loss": 0.0205, + "step": 471 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.11524809151887894, + "epoch": 1.2421052631578948, + "grad_norm": 0.017463896423578262, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7361.0, + "completions/mean_length": 927.275390625, + "completions/mean_terminated_length": 805.5689086914062, + "completions/min_length": 269.0, + "completions/min_terminated_length": 269.0, + "entropy": 0.12217172235250473, + "epoch": 1.2447368421052631, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.030732987448573112, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 157807385.0, + "reward": 0.8150861263275146, + "reward_std": 0.15566140413284302, + "rewards/progression_diversity/mean": -0.0001790223177522421, + "rewards/progression_diversity/std": 0.004050812683999538, + "rewards/symbolic_reward_accuracy/mean": 0.888671875, + "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, + "rewards/symbolic_reward_partial_score/mean": 0.9422200322151184, + "rewards/symbolic_reward_partial_score/std": 0.19841378927230835, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0209197998046875, + "sampling/importance_sampling_ratio/min": 4.93685820401879e-06, + "sampling/sampling_logp_difference/max": 12.218781471252441, + "sampling/sampling_logp_difference/mean": 0.06566841155290604, + "step": 473 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.12189845740795135, + "epoch": 1.2473684210526317, + "grad_norm": 0.023981690406799316, + "learning_rate": 1e-06, + "loss": 0.03, + "step": 474 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.1238529235124588, + "epoch": 1.25, + "grad_norm": 0.02387721836566925, + "learning_rate": 1e-06, + "loss": 0.0418, + "step": 475 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.12588384374976158, + "epoch": 1.2526315789473683, + "grad_norm": 0.021237516775727272, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9261.0, + "completions/mean_length": 899.77734375, + "completions/mean_terminated_length": 716.1699829101562, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.13065219670534134, + "epoch": 1.2552631578947369, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.0358455665409565, + "learning_rate": 1e-06, + "loss": 0.0222, + "num_tokens": 158659847.0, + "reward": 0.8351035714149475, + "reward_std": 0.10815407335758209, + "rewards/progression_diversity/mean": -0.0003900247684214264, + "rewards/progression_diversity/std": 0.008825253695249557, + "rewards/symbolic_reward_accuracy/mean": 0.91796875, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.9510090947151184, + "rewards/symbolic_reward_partial_score/std": 0.1945120096206665, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0229235887527466, + "sampling/importance_sampling_ratio/min": 4.0339618863072246e-05, + "sampling/sampling_logp_difference/max": 10.118176460266113, + "sampling/sampling_logp_difference/mean": 0.06817199289798737, + "step": 477 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.13009684532880783, + "epoch": 1.2578947368421054, + "grad_norm": 0.017281439155340195, + "learning_rate": 1e-06, + "loss": 0.0275, + "step": 478 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.12655635178089142, + "epoch": 1.2605263157894737, + "grad_norm": 0.011967520229518414, + "learning_rate": 1e-06, + "loss": 0.0433, + "step": 479 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.13125859946012497, + "epoch": 1.263157894736842, + "grad_norm": 0.010364379733800888, + "learning_rate": 1e-06, + "loss": 0.0029, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11268.0, + "completions/mean_length": 1359.4765625, + "completions/mean_terminated_length": 780.4381103515625, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "entropy": 0.123091921210289, + "epoch": 1.2657894736842106, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.03727136552333832, + "learning_rate": 1e-06, + "loss": 0.0505, + "num_tokens": 159759867.0, + "reward": 0.8022311925888062, + "reward_std": 0.1121816635131836, + "rewards/progression_diversity/mean": -0.001497793011367321, + "rewards/progression_diversity/std": 0.01563715748488903, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9208984375, + "rewards/symbolic_reward_partial_score/std": 0.2492290735244751, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0214684009552002, + "sampling/importance_sampling_ratio/min": 1.330551033934535e-11, + "sampling/sampling_logp_difference/max": 25.042842864990234, + "sampling/sampling_logp_difference/mean": 0.06472167372703552, + "step": 481 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.12072613090276718, + "epoch": 1.268421052631579, + "grad_norm": 0.022708555683493614, + "learning_rate": 1e-06, + "loss": 0.0388, + "step": 482 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.12625328078866005, + "epoch": 1.2710526315789474, + "grad_norm": 0.021841494366526604, + "learning_rate": 1e-06, + "loss": 0.0393, + "step": 483 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.12154950946569443, + "epoch": 1.2736842105263158, + "grad_norm": 0.009138455614447594, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4483.0, + "completions/mean_length": 1131.763671875, + "completions/mean_terminated_length": 671.4345703125, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.12409983202815056, + "epoch": 1.2763157894736843, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.025687798857688904, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 160721602.0, + "reward": 0.8422271013259888, + "reward_std": 0.07236142456531525, + "rewards/progression_diversity/mean": -0.0009313340415246785, + "rewards/progression_diversity/std": 0.01315159909427166, + "rewards/symbolic_reward_accuracy/mean": 0.931640625, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + "rewards/symbolic_reward_partial_score/mean": 0.9539387822151184, + "rewards/symbolic_reward_partial_score/std": 0.19256454706192017, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0208799839019775, + "sampling/importance_sampling_ratio/min": 0.000312736548949033, + "sampling/sampling_logp_difference/max": 8.070149421691895, + "sampling/sampling_logp_difference/mean": 0.06437121331691742, + "step": 485 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.11737996339797974, + "epoch": 1.2789473684210526, + "grad_norm": 0.024496793746948242, + "learning_rate": 1e-06, + "loss": 0.0558, + "step": 486 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.078125, + "entropy": 0.11956480145454407, + "epoch": 1.2815789473684212, + "grad_norm": 0.010835636407136917, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 487 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.1189895048737526, + "epoch": 1.2842105263157895, + "grad_norm": 0.011718123219907284, + "learning_rate": 1e-06, + "loss": 0.0353, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5296.0, + "completions/mean_length": 1337.478515625, + "completions/mean_terminated_length": 789.2247314453125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.11463934555649757, + "epoch": 1.2868421052631578, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.04130804166197777, + "learning_rate": 1e-06, + "loss": 0.0829, + "num_tokens": 161806871.0, + "reward": 0.7914862632751465, + "reward_std": 0.1913241147994995, + "rewards/progression_diversity/mean": -0.0017659981967881322, + "rewards/progression_diversity/std": 0.017526477575302124, + "rewards/symbolic_reward_accuracy/mean": 0.8671875, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.9156900644302368, + "rewards/symbolic_reward_partial_score/std": 0.2554028630256653, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.018146276473999, + "sampling/importance_sampling_ratio/min": 1.5589847635055776e-07, + "sampling/sampling_logp_difference/max": 15.674060821533203, + "sampling/sampling_logp_difference/mean": 0.06120399013161659, + "step": 489 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.11538948118686676, + "epoch": 1.2894736842105263, + "grad_norm": 0.013879453763365746, + "learning_rate": 1e-06, + "loss": 0.0104, + "step": 490 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.11806956678628922, + "epoch": 1.2921052631578949, + "grad_norm": 0.012860557064414024, + "learning_rate": 1e-06, + "loss": 0.0499, + "step": 491 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.1125846691429615, + "epoch": 1.2947368421052632, + "grad_norm": 0.02000536397099495, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8706.0, + "completions/mean_length": 924.626953125, + "completions/mean_terminated_length": 741.3142700195312, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.1223670057952404, + "epoch": 1.2973684210526315, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.014644470997154713, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 162687256.0, + "reward": 0.8528740406036377, + "reward_std": 0.12011945247650146, + "rewards/progression_diversity/mean": -0.000687220657709986, + "rewards/progression_diversity/std": 0.013531150296330452, + "rewards/symbolic_reward_accuracy/mean": 0.94140625, + "rewards/symbolic_reward_accuracy/std": 0.23509246110916138, + "rewards/symbolic_reward_partial_score/mean": 0.96337890625, + "rewards/symbolic_reward_partial_score/std": 0.17117686569690704, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0194642543792725, + "sampling/importance_sampling_ratio/min": 2.044983940790368e-21, + "sampling/sampling_logp_difference/max": 47.63889694213867, + "sampling/sampling_logp_difference/mean": 0.06407853960990906, + "step": 493 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.1194252297282219, + "epoch": 1.3, + "grad_norm": 0.00824655033648014, + "learning_rate": 1e-06, + "loss": 0.033, + "step": 494 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.11969351023435593, + "epoch": 1.3026315789473684, + "grad_norm": 0.05345854163169861, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 495 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.11757103353738785, + "epoch": 1.305263157894737, + "grad_norm": 0.00989691074937582, + "learning_rate": 1e-06, + "loss": -0.0039, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9763.0, + "completions/mean_length": 871.619140625, + "completions/mean_terminated_length": 780.1906127929688, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.1206367239356041, + "epoch": 1.3078947368421052, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.048302192240953445, + "learning_rate": 1e-06, + "loss": 0.0334, + "num_tokens": 163535829.0, + "reward": 0.830558180809021, + "reward_std": 0.14636383950710297, + "rewards/progression_diversity/mean": -0.000826410308945924, + "rewards/progression_diversity/std": 0.01099415123462677, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9501953125, + "rewards/symbolic_reward_partial_score/std": 0.1905975341796875, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.020845651626587, + "sampling/importance_sampling_ratio/min": 1.0631980984499023e-07, + "sampling/sampling_logp_difference/max": 16.056814193725586, + "sampling/sampling_logp_difference/mean": 0.06594814360141754, + "step": 497 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.1199469231069088, + "epoch": 1.3105263157894738, + "grad_norm": 0.013368850573897362, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 498 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.12389621138572693, + "epoch": 1.313157894736842, + "grad_norm": 0.015475841239094734, + "learning_rate": 1e-06, + "loss": 0.0216, + "step": 499 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.12212049216032028, + "epoch": 1.3157894736842106, + "grad_norm": 0.018125230446457863, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6161.0, + "completions/mean_length": 1061.85546875, + "completions/mean_terminated_length": 725.4411010742188, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.12253501266241074, + "epoch": 1.318421052631579, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.031911756843328476, + "learning_rate": 1e-06, + "loss": 0.0289, + "num_tokens": 164483307.0, + "reward": 0.8256685137748718, + "reward_std": 0.12521421909332275, + "rewards/progression_diversity/mean": -0.0015109577216207981, + "rewards/progression_diversity/std": 0.028086457401514053, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.9423828125, + "rewards/symbolic_reward_partial_score/std": 0.20720478892326355, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.019927740097046, + "sampling/importance_sampling_ratio/min": 7.929816092655528e-06, + "sampling/sampling_logp_difference/max": 11.744880676269531, + "sampling/sampling_logp_difference/mean": 0.06401927769184113, + "step": 501 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11759228631854057, + "epoch": 1.3210526315789473, + "grad_norm": 0.14602532982826233, + "learning_rate": 1e-06, + "loss": 0.0659, + "step": 502 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.12064041569828987, + "epoch": 1.3236842105263158, + "grad_norm": 0.01236443966627121, + "learning_rate": 1e-06, + "loss": -0.0013, + "step": 503 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.12389854341745377, + "epoch": 1.3263157894736843, + "grad_norm": 0.015112272463738918, + "learning_rate": 1e-06, + "loss": -0.0001, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6364.0, + "completions/mean_length": 1221.919921875, + "completions/mean_terminated_length": 826.9158325195312, + "completions/min_length": 300.0, + "completions/min_terminated_length": 300.0, + "entropy": 0.12016144394874573, + "epoch": 1.3289473684210527, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.019961973652243614, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 165545602.0, + "reward": 0.8171666860580444, + "reward_std": 0.16274571418762207, + "rewards/progression_diversity/mean": -0.0020828458946198225, + "rewards/progression_diversity/std": 0.021263638511300087, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.9231770634651184, + "rewards/symbolic_reward_partial_score/std": 0.2538268566131592, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0189927816390991, + "sampling/importance_sampling_ratio/min": 2.001787464678273e-07, + "sampling/sampling_logp_difference/max": 15.424055099487305, + "sampling/sampling_logp_difference/mean": 0.06025902181863785, + "step": 505 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.1265234798192978, + "epoch": 1.331578947368421, + "grad_norm": 0.014271781779825687, + "learning_rate": 1e-06, + "loss": 0.0147, + "step": 506 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.11439178138971329, + "epoch": 1.3342105263157895, + "grad_norm": 0.011770401149988174, + "learning_rate": 1e-06, + "loss": 0.0508, + "step": 507 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.11405204609036446, + "epoch": 1.3368421052631578, + "grad_norm": 0.013367000967264175, + "learning_rate": 1e-06, + "loss": 0.0757, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6573.0, + "completions/mean_length": 1136.01171875, + "completions/mean_terminated_length": 832.2669677734375, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.11793714761734009, + "epoch": 1.3394736842105264, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.030582962557673454, + "learning_rate": 1e-06, + "loss": 0.0369, + "num_tokens": 166528040.0, + "reward": 0.8212710022926331, + "reward_std": 0.13752031326293945, + "rewards/progression_diversity/mean": -0.0018108001677319407, + "rewards/progression_diversity/std": 0.021729890257120132, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.935546875, + "rewards/symbolic_reward_partial_score/std": 0.22706012427806854, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0203887224197388, + "sampling/importance_sampling_ratio/min": 2.9788969186483882e-05, + "sampling/sampling_logp_difference/max": 10.421372413635254, + "sampling/sampling_logp_difference/mean": 0.06702390313148499, + "step": 509 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.11901584640145302, + "epoch": 1.3421052631578947, + "grad_norm": 0.0400007963180542, + "learning_rate": 1e-06, + "loss": 0.0253, + "step": 510 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.11924078688025475, + "epoch": 1.3447368421052632, + "grad_norm": 0.02401769533753395, + "learning_rate": 1e-06, + "loss": 0.0038, + "step": 511 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.12153895944356918, + "epoch": 1.3473684210526315, + "grad_norm": 0.0278632752597332, + "learning_rate": 1e-06, + "loss": 0.0202, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6421.0, + "completions/mean_length": 1049.525390625, + "completions/mean_terminated_length": 712.84033203125, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.1217559427022934, + "epoch": 1.35, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.01790975034236908, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 167447669.0, + "reward": 0.8032069802284241, + "reward_std": 0.15127655863761902, + "rewards/progression_diversity/mean": -0.0015715567860752344, + "rewards/progression_diversity/std": 0.016383837908506393, + "rewards/symbolic_reward_accuracy/mean": 0.87890625, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.9267578125, + "rewards/symbolic_reward_partial_score/std": 0.23392236232757568, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0203694105148315, + "sampling/importance_sampling_ratio/min": 7.359984124377661e-07, + "sampling/sampling_logp_difference/max": 14.122037887573242, + "sampling/sampling_logp_difference/mean": 0.06768319010734558, + "step": 513 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.1187475174665451, + "epoch": 1.3526315789473684, + "grad_norm": 0.009799002669751644, + "learning_rate": 1e-06, + "loss": 0.0417, + "step": 514 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.11943993717432022, + "epoch": 1.3552631578947367, + "grad_norm": 0.020848069339990616, + "learning_rate": 1e-06, + "loss": 0.0084, + "step": 515 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.12406942620873451, + "epoch": 1.3578947368421053, + "grad_norm": 0.019777188077569008, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11133.0, + "completions/mean_length": 1168.974609375, + "completions/mean_terminated_length": 834.9121704101562, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "entropy": 0.1224772073328495, + "epoch": 1.3605263157894738, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.021967625245451927, + "learning_rate": 1e-06, + "loss": 0.0293, + "num_tokens": 168440488.0, + "reward": 0.8120394945144653, + "reward_std": 0.16349293291568756, + "rewards/progression_diversity/mean": -0.0021093892864882946, + "rewards/progression_diversity/std": 0.01896587759256363, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.9249674081802368, + "rewards/symbolic_reward_partial_score/std": 0.24613560736179352, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0184623003005981, + "sampling/importance_sampling_ratio/min": 1.3710611028727726e-06, + "sampling/sampling_logp_difference/max": 13.49992561340332, + "sampling/sampling_logp_difference/mean": 0.06050651893019676, + "step": 517 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.11167807132005692, + "epoch": 1.3631578947368421, + "grad_norm": 0.03911877050995827, + "learning_rate": 1e-06, + "loss": 0.083, + "step": 518 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.11478234827518463, + "epoch": 1.3657894736842104, + "grad_norm": 0.020847424864768982, + "learning_rate": 1e-06, + "loss": 0.0529, + "step": 519 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.11543101072311401, + "epoch": 1.368421052631579, + "grad_norm": 0.018829762935638428, + "learning_rate": 1e-06, + "loss": 0.0307, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6398.0, + "completions/mean_length": 1195.33984375, + "completions/mean_terminated_length": 736.9295654296875, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "entropy": 0.12062574923038483, + "epoch": 1.3710526315789473, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.014021288603544235, + "learning_rate": 1e-06, + "loss": 0.0427, + "num_tokens": 169450134.0, + "reward": 0.7832722663879395, + "reward_std": 0.13144497573375702, + "rewards/progression_diversity/mean": -0.002853758167475462, + "rewards/progression_diversity/std": 0.023874642327427864, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.9137369394302368, + "rewards/symbolic_reward_partial_score/std": 0.2524980306625366, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0197254419326782, + "sampling/importance_sampling_ratio/min": 1.406861771338595e-16, + "sampling/sampling_logp_difference/max": 36.5, + "sampling/sampling_logp_difference/mean": 0.064872145652771, + "step": 521 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.12126883119344711, + "epoch": 1.3736842105263158, + "grad_norm": 0.018665987998247147, + "learning_rate": 1e-06, + "loss": 0.0304, + "step": 522 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.1257994957268238, + "epoch": 1.3763157894736842, + "grad_norm": 0.028348246589303017, + "learning_rate": 1e-06, + "loss": 0.0122, + "step": 523 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.12155702710151672, + "epoch": 1.3789473684210527, + "grad_norm": 0.027928415685892105, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15986.0, + "completions/mean_length": 1216.671875, + "completions/mean_terminated_length": 758.9053955078125, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.12585609033703804, + "epoch": 1.381578947368421, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.020609745755791664, + "learning_rate": 1e-06, + "loss": 0.0281, + "num_tokens": 170497390.0, + "reward": 0.8209607601165771, + "reward_std": 0.13462477922439575, + "rewards/progression_diversity/mean": -0.003541269339621067, + "rewards/progression_diversity/std": 0.028825946152210236, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.9378255009651184, + "rewards/symbolic_reward_partial_score/std": 0.22360049188137054, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0208656787872314, + "sampling/importance_sampling_ratio/min": 2.124352249666117e-07, + "sampling/sampling_logp_difference/max": 15.364628791809082, + "sampling/sampling_logp_difference/mean": 0.06426618993282318, + "step": 525 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.1270783357322216, + "epoch": 1.3842105263157896, + "grad_norm": 0.015202553011476994, + "learning_rate": 1e-06, + "loss": 0.0751, + "step": 526 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.13192099332809448, + "epoch": 1.3868421052631579, + "grad_norm": 0.02699258364737034, + "learning_rate": 1e-06, + "loss": 0.022, + "step": 527 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.13195763528347015, + "epoch": 1.3894736842105262, + "grad_norm": 0.02506939321756363, + "learning_rate": 1e-06, + "loss": 0.0573, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5820.0, + "completions/mean_length": 1409.9921875, + "completions/mean_terminated_length": 705.6932373046875, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.132181815803051, + "epoch": 1.3921052631578947, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.015631964430212975, + "learning_rate": 1e-06, + "loss": 0.0212, + "num_tokens": 171635146.0, + "reward": 0.7929407954216003, + "reward_std": 0.14727577567100525, + "rewards/progression_diversity/mean": -0.007682529743760824, + "rewards/progression_diversity/std": 0.043747011572122574, + "rewards/symbolic_reward_accuracy/mean": 0.8671875, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.9239909052848816, + "rewards/symbolic_reward_partial_score/std": 0.23265688121318817, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0227715969085693, + "sampling/importance_sampling_ratio/min": 1.4806555270752142e-07, + "sampling/sampling_logp_difference/max": 15.725610733032227, + "sampling/sampling_logp_difference/mean": 0.06556607782840729, + "step": 529 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.12692655250430107, + "epoch": 1.3947368421052633, + "grad_norm": 0.013282880187034607, + "learning_rate": 1e-06, + "loss": 0.0761, + "step": 530 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.13208907842636108, + "epoch": 1.3973684210526316, + "grad_norm": 0.01377064548432827, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 531 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.13234106451272964, + "epoch": 1.4, + "grad_norm": 0.020579254254698753, + "learning_rate": 1e-06, + "loss": 0.0348, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4966.0, + "completions/mean_length": 1032.94921875, + "completions/mean_terminated_length": 695.9002075195312, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.12633076682686806, + "epoch": 1.4026315789473685, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.039995189756155014, + "learning_rate": 1e-06, + "loss": 0.101, + "num_tokens": 172575824.0, + "reward": 0.8291662931442261, + "reward_std": 0.12353667616844177, + "rewards/progression_diversity/mean": -0.003291813191026449, + "rewards/progression_diversity/std": 0.029986631125211716, + "rewards/symbolic_reward_accuracy/mean": 0.912109375, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.9469400644302368, + "rewards/symbolic_reward_partial_score/std": 0.20057909190654755, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0211803913116455, + "sampling/importance_sampling_ratio/min": 8.730064109840896e-06, + "sampling/sampling_logp_difference/max": 11.648737907409668, + "sampling/sampling_logp_difference/mean": 0.0656655952334404, + "step": 533 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.13137900829315186, + "epoch": 1.4052631578947368, + "grad_norm": 0.018619263544678688, + "learning_rate": 1e-06, + "loss": -0.0028, + "step": 534 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.1252904236316681, + "epoch": 1.4078947368421053, + "grad_norm": 0.015142420306801796, + "learning_rate": 1e-06, + "loss": 0.0256, + "step": 535 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.13380999118089676, + "epoch": 1.4105263157894736, + "grad_norm": 0.0105243269354105, + "learning_rate": 1e-06, + "loss": 0.0065, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3757.0, + "completions/mean_length": 1088.859375, + "completions/mean_terminated_length": 658.87548828125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.13350340723991394, + "epoch": 1.4131578947368422, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.03917808085680008, + "learning_rate": 1e-06, + "loss": 0.0387, + "num_tokens": 173525096.0, + "reward": 0.8361979722976685, + "reward_std": 0.11943801492452621, + "rewards/progression_diversity/mean": -0.0032509397715330124, + "rewards/progression_diversity/std": 0.02727237343788147, + "rewards/symbolic_reward_accuracy/mean": 0.923828125, + "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, + "rewards/symbolic_reward_partial_score/mean": 0.9488931894302368, + "rewards/symbolic_reward_partial_score/std": 0.2026350498199463, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0211467742919922, + "sampling/importance_sampling_ratio/min": 5.162648449186236e-06, + "sampling/sampling_logp_difference/max": 12.174060821533203, + "sampling/sampling_logp_difference/mean": 0.0664573684334755, + "step": 537 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.13789378851652145, + "epoch": 1.4157894736842105, + "grad_norm": 0.008778770454227924, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 538 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.13081741333007812, + "epoch": 1.418421052631579, + "grad_norm": 0.019565429538488388, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 539 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.12705697864294052, + "epoch": 1.4210526315789473, + "grad_norm": 0.020435446873307228, + "learning_rate": 1e-06, + "loss": 0.0642, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7683.0, + "completions/mean_length": 1142.486328125, + "completions/mean_terminated_length": 619.0404052734375, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.12402678281068802, + "epoch": 1.4236842105263157, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.022379858419299126, + "learning_rate": 1e-06, + "loss": 0.0262, + "num_tokens": 174504833.0, + "reward": 0.8272143006324768, + "reward_std": 0.06737488508224487, + "rewards/progression_diversity/mean": -0.0031863353215157986, + "rewards/progression_diversity/std": 0.028556160628795624, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9475911259651184, + "rewards/symbolic_reward_partial_score/std": 0.2016286551952362, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0205636024475098, + "sampling/importance_sampling_ratio/min": 2.388037658853444e-15, + "sampling/sampling_logp_difference/max": 33.668304443359375, + "sampling/sampling_logp_difference/mean": 0.06742183864116669, + "step": 541 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.13175051659345627, + "epoch": 1.4263157894736842, + "grad_norm": 0.009576407261192799, + "learning_rate": 1e-06, + "loss": 0.0465, + "step": 542 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.12570127844810486, + "epoch": 1.4289473684210527, + "grad_norm": 0.011829288676381111, + "learning_rate": 1e-06, + "loss": 0.0597, + "step": 543 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.1273176595568657, + "epoch": 1.431578947368421, + "grad_norm": 0.01299221906810999, + "learning_rate": 1e-06, + "loss": 0.0145, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16202.0, + "completions/mean_length": 961.04296875, + "completions/mean_terminated_length": 622.4151611328125, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.11435278132557869, + "epoch": 1.4342105263157894, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.030455466359853745, + "learning_rate": 1e-06, + "loss": 0.0736, + "num_tokens": 175375831.0, + "reward": 0.8315600156784058, + "reward_std": 0.10956034064292908, + "rewards/progression_diversity/mean": -0.00318007729947567, + "rewards/progression_diversity/std": 0.026876848191022873, + "rewards/symbolic_reward_accuracy/mean": 0.9140625, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.9510090947151184, + "rewards/symbolic_reward_partial_score/std": 0.19548769295215607, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0186996459960938, + "sampling/importance_sampling_ratio/min": 6.467719504144043e-05, + "sampling/sampling_logp_difference/max": 9.646101951599121, + "sampling/sampling_logp_difference/mean": 0.06452836096286774, + "step": 545 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.1248774528503418, + "epoch": 1.436842105263158, + "grad_norm": 0.030380593612790108, + "learning_rate": 1e-06, + "loss": 0.0411, + "step": 546 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.11489719897508621, + "epoch": 1.4394736842105262, + "grad_norm": 0.013684880919754505, + "learning_rate": 1e-06, + "loss": 0.031, + "step": 547 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.12723349034786224, + "epoch": 1.4421052631578948, + "grad_norm": 0.005004690028727055, + "learning_rate": 1e-06, + "loss": 0.0033, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7127.0, + "completions/mean_length": 1883.12109375, + "completions/mean_terminated_length": 720.6033325195312, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.11534935608506203, + "epoch": 1.444736842105263, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.0357719361782074, + "learning_rate": 1e-06, + "loss": 0.064, + "num_tokens": 176770005.0, + "reward": 0.7715877294540405, + "reward_std": 0.17539432644844055, + "rewards/progression_diversity/mean": -0.009200896136462688, + "rewards/progression_diversity/std": 0.044748030602931976, + "rewards/symbolic_reward_accuracy/mean": 0.849609375, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.8977864980697632, + "rewards/symbolic_reward_partial_score/std": 0.2720419764518738, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.017478108406067, + "sampling/importance_sampling_ratio/min": 2.1775213099317625e-05, + "sampling/sampling_logp_difference/max": 10.73473834991455, + "sampling/sampling_logp_difference/mean": 0.05594378709793091, + "step": 549 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.11893630772829056, + "epoch": 1.4473684210526316, + "grad_norm": 0.020856492221355438, + "learning_rate": 1e-06, + "loss": 0.0626, + "step": 550 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.10961251333355904, + "epoch": 1.45, + "grad_norm": 0.032535944133996964, + "learning_rate": 1e-06, + "loss": 0.1122, + "step": 551 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.12273216247558594, + "epoch": 1.4526315789473685, + "grad_norm": 0.015543154440820217, + "learning_rate": 1e-06, + "loss": 0.0734, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3752.0, + "completions/mean_length": 1222.4609375, + "completions/mean_terminated_length": 638.1419677734375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.12245375290513039, + "epoch": 1.4552631578947368, + "frac_reward_zero_std": 0.65625, + "grad_norm": 0.010475813411176205, + "learning_rate": 1e-06, + "loss": 0.0767, + "num_tokens": 177793537.0, + "reward": 0.8133152723312378, + "reward_std": 0.0920829027891159, + "rewards/progression_diversity/mean": -0.006364539731293917, + "rewards/progression_diversity/std": 0.04115993529558182, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.9345703125, + "rewards/symbolic_reward_partial_score/std": 0.2200278490781784, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0192129611968994, + "sampling/importance_sampling_ratio/min": 5.405944246937577e-13, + "sampling/sampling_logp_difference/max": 28.24610710144043, + "sampling/sampling_logp_difference/mean": 0.06582315266132355, + "step": 553 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.1257517747581005, + "epoch": 1.4578947368421051, + "grad_norm": 0.010606672614812851, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 554 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.1262059360742569, + "epoch": 1.4605263157894737, + "grad_norm": 0.027410009875893593, + "learning_rate": 1e-06, + "loss": 0.05, + "step": 555 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.12493745982646942, + "epoch": 1.4631578947368422, + "grad_norm": 0.011104092933237553, + "learning_rate": 1e-06, + "loss": 0.0094, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6435.0, + "completions/mean_length": 829.318359375, + "completions/mean_terminated_length": 644.8755493164062, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.1371982842683792, + "epoch": 1.4657894736842105, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.05659002810716629, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 178611268.0, + "reward": 0.8500331044197083, + "reward_std": 0.11168558895587921, + "rewards/progression_diversity/mean": -0.001577683025971055, + "rewards/progression_diversity/std": 0.019814975559711456, + "rewards/symbolic_reward_accuracy/mean": 0.9375, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.96240234375, + "rewards/symbolic_reward_partial_score/std": 0.17325447499752045, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0206682682037354, + "sampling/importance_sampling_ratio/min": 8.104348694359942e-07, + "sampling/sampling_logp_difference/max": 14.025694847106934, + "sampling/sampling_logp_difference/mean": 0.0689624696969986, + "step": 557 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.13341526687145233, + "epoch": 1.4684210526315788, + "grad_norm": 0.010691308416426182, + "learning_rate": 1e-06, + "loss": -0.005, + "step": 558 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.13226507604122162, + "epoch": 1.4710526315789474, + "grad_norm": 0.01808001846075058, + "learning_rate": 1e-06, + "loss": 0.0304, + "step": 559 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.12875394523143768, + "epoch": 1.4736842105263157, + "grad_norm": 0.01617010124027729, + "learning_rate": 1e-06, + "loss": 0.0914, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5451.0, + "completions/mean_length": 817.533203125, + "completions/mean_terminated_length": 664.0177612304688, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.1301540583372116, + "epoch": 1.4763157894736842, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.026065807789564133, + "learning_rate": 1e-06, + "loss": 0.0329, + "num_tokens": 179427317.0, + "reward": 0.8277164697647095, + "reward_std": 0.10951961576938629, + "rewards/progression_diversity/mean": -0.0017917966470122337, + "rewards/progression_diversity/std": 0.020995857194066048, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.9615885019302368, + "rewards/symbolic_reward_partial_score/std": 0.14556662738323212, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0219957828521729, + "sampling/importance_sampling_ratio/min": 3.670751175377518e-05, + "sampling/sampling_logp_difference/max": 10.212529182434082, + "sampling/sampling_logp_difference/mean": 0.0698399469256401, + "step": 561 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.13334877789020538, + "epoch": 1.4789473684210526, + "grad_norm": 0.008394371718168259, + "learning_rate": 1e-06, + "loss": 0.0013, + "step": 562 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.13843107223510742, + "epoch": 1.481578947368421, + "grad_norm": 0.04058850556612015, + "learning_rate": 1e-06, + "loss": 0.016, + "step": 563 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.13118517398834229, + "epoch": 1.4842105263157894, + "grad_norm": 0.01614241674542427, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6531.0, + "completions/mean_length": 730.0703125, + "completions/mean_terminated_length": 668.682373046875, + "completions/min_length": 258.0, + "completions/min_terminated_length": 258.0, + "entropy": 0.1351623460650444, + "epoch": 1.486842105263158, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.01471333485096693, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 180205241.0, + "reward": 0.8588860034942627, + "reward_std": 0.09307113289833069, + "rewards/progression_diversity/mean": -7.880153134465218e-05, + "rewards/progression_diversity/std": 0.0017830750439316034, + "rewards/symbolic_reward_accuracy/mean": 0.9453125, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.9736328125, + "rewards/symbolic_reward_partial_score/std": 0.13708151876926422, + "rewards/tag_count_reward/mean": -0.00390625, + "rewards/tag_count_reward/std": 0.06243881583213806, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0231963396072388, + "sampling/importance_sampling_ratio/min": 1.8753072481558775e-06, + "sampling/sampling_logp_difference/max": 13.186738014221191, + "sampling/sampling_logp_difference/mean": 0.07400602847337723, + "step": 565 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0859375, + "entropy": 0.13259483128786087, + "epoch": 1.4894736842105263, + "grad_norm": 0.011994445696473122, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 566 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.13419703394174576, + "epoch": 1.4921052631578946, + "grad_norm": 0.009042926132678986, + "learning_rate": 1e-06, + "loss": 0.0123, + "step": 567 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.13261277973651886, + "epoch": 1.4947368421052631, + "grad_norm": 0.013076446950435638, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7868.0, + "completions/mean_length": 976.060546875, + "completions/mean_terminated_length": 700.3717651367188, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.13073333352804184, + "epoch": 1.4973684210526317, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.017082160338759422, + "learning_rate": 1e-06, + "loss": 0.0381, + "num_tokens": 181120984.0, + "reward": 0.8026704788208008, + "reward_std": 0.17961543798446655, + "rewards/progression_diversity/mean": -0.0015097158029675484, + "rewards/progression_diversity/std": 0.018514791503548622, + "rewards/symbolic_reward_accuracy/mean": 0.876953125, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.9275716543197632, + "rewards/symbolic_reward_partial_score/std": 0.2205788642168045, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0213254690170288, + "sampling/importance_sampling_ratio/min": 2.747718781392905e-07, + "sampling/sampling_logp_difference/max": 15.107324600219727, + "sampling/sampling_logp_difference/mean": 0.07009261846542358, + "step": 569 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.1352020874619484, + "epoch": 1.5, + "grad_norm": 0.02589314803481102, + "learning_rate": 1e-06, + "loss": 0.0211, + "step": 570 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.13306277990341187, + "epoch": 1.5026315789473683, + "grad_norm": 0.018376614898443222, + "learning_rate": 1e-06, + "loss": 0.014, + "step": 571 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.1296408474445343, + "epoch": 1.5052631578947369, + "grad_norm": 0.028880199417471886, + "learning_rate": 1e-06, + "loss": 0.0695, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5029.0, + "completions/mean_length": 1003.607421875, + "completions/mean_terminated_length": 665.9141845703125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.1362362802028656, + "epoch": 1.5078947368421054, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.03721587732434273, + "learning_rate": 1e-06, + "loss": 0.0271, + "num_tokens": 182032623.0, + "reward": 0.8145319223403931, + "reward_std": 0.13320812582969666, + "rewards/progression_diversity/mean": -0.001894364831969142, + "rewards/progression_diversity/std": 0.017769131809473038, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.9371744990348816, + "rewards/symbolic_reward_partial_score/std": 0.2098037451505661, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0225517749786377, + "sampling/importance_sampling_ratio/min": 1.7453003763891715e-15, + "sampling/sampling_logp_difference/max": 33.981849670410156, + "sampling/sampling_logp_difference/mean": 0.07218128442764282, + "step": 573 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.1370982825756073, + "epoch": 1.5105263157894737, + "grad_norm": 0.01768220029771328, + "learning_rate": 1e-06, + "loss": 0.0052, + "step": 574 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.125, + "entropy": 0.1406766176223755, + "epoch": 1.513157894736842, + "grad_norm": 0.025351712480187416, + "learning_rate": 1e-06, + "loss": 0.0384, + "step": 575 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.1350262686610222, + "epoch": 1.5157894736842106, + "grad_norm": 0.012894677929580212, + "learning_rate": 1e-06, + "loss": 0.0346, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6116.0, + "completions/mean_length": 1377.828125, + "completions/mean_terminated_length": 831.0445556640625, + "completions/min_length": 322.0, + "completions/min_terminated_length": 322.0, + "entropy": 0.13445448130369186, + "epoch": 1.518421052631579, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.034472037106752396, + "learning_rate": 1e-06, + "loss": 0.0196, + "num_tokens": 183164919.0, + "reward": 0.7574737071990967, + "reward_std": 0.18111687898635864, + "rewards/progression_diversity/mean": -0.004586691036820412, + "rewards/progression_diversity/std": 0.0347183421254158, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.8916015625, + "rewards/symbolic_reward_partial_score/std": 0.2719910144805908, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0212225914001465, + "sampling/importance_sampling_ratio/min": 3.2291410434481804e-08, + "sampling/sampling_logp_difference/max": 17.248464584350586, + "sampling/sampling_logp_difference/mean": 0.06880679726600647, + "step": 577 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.1360417976975441, + "epoch": 1.5210526315789474, + "grad_norm": 0.01104611437767744, + "learning_rate": 1e-06, + "loss": 0.0152, + "step": 578 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.12674961611628532, + "epoch": 1.5236842105263158, + "grad_norm": 0.045898765325546265, + "learning_rate": 1e-06, + "loss": 0.0459, + "step": 579 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.125128373503685, + "epoch": 1.526315789473684, + "grad_norm": 0.015659412369132042, + "learning_rate": 1e-06, + "loss": 0.0575, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5905.0, + "completions/mean_length": 1173.80859375, + "completions/mean_terminated_length": 714.7484741210938, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.12383318692445755, + "epoch": 1.5289473684210526, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.04507393762469292, + "learning_rate": 1e-06, + "loss": 0.0544, + "num_tokens": 184160693.0, + "reward": 0.8106682300567627, + "reward_std": 0.1318516582250595, + "rewards/progression_diversity/mean": -0.0025172452442348003, + "rewards/progression_diversity/std": 0.021089140325784683, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.9308268427848816, + "rewards/symbolic_reward_partial_score/std": 0.2209477424621582, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0210371017456055, + "sampling/importance_sampling_ratio/min": 1.1693467028006665e-10, + "sampling/sampling_logp_difference/max": 22.86940574645996, + "sampling/sampling_logp_difference/mean": 0.07237155735492706, + "step": 581 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.12782908231019974, + "epoch": 1.5315789473684212, + "grad_norm": 0.010001985356211662, + "learning_rate": 1e-06, + "loss": 0.0502, + "step": 582 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.12940338253974915, + "epoch": 1.5342105263157895, + "grad_norm": 0.022164426743984222, + "learning_rate": 1e-06, + "loss": 0.0063, + "step": 583 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.1259337067604065, + "epoch": 1.5368421052631578, + "grad_norm": 0.010080978274345398, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5788.0, + "completions/mean_length": 1110.978515625, + "completions/mean_terminated_length": 744.426025390625, + "completions/min_length": 234.0, + "completions/min_terminated_length": 234.0, + "entropy": 0.11713157966732979, + "epoch": 1.5394736842105263, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.034748658537864685, + "learning_rate": 1e-06, + "loss": 0.0692, + "num_tokens": 185156842.0, + "reward": 0.8088133931159973, + "reward_std": 0.16672645509243011, + "rewards/progression_diversity/mean": -0.0024531371891498566, + "rewards/progression_diversity/std": 0.021702419966459274, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9337564706802368, + "rewards/symbolic_reward_partial_score/std": 0.2161990851163864, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.019039511680603, + "sampling/importance_sampling_ratio/min": 1.6557880826439941e-06, + "sampling/sampling_logp_difference/max": 13.311233520507812, + "sampling/sampling_logp_difference/mean": 0.06788427382707596, + "step": 585 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.12097888439893723, + "epoch": 1.5421052631578949, + "grad_norm": 0.02093541994690895, + "learning_rate": 1e-06, + "loss": 0.0109, + "step": 586 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.12494004517793655, + "epoch": 1.5447368421052632, + "grad_norm": 0.012646614573895931, + "learning_rate": 1e-06, + "loss": -0.0019, + "step": 587 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.12453506141901016, + "epoch": 1.5473684210526315, + "grad_norm": 0.020611083135008812, + "learning_rate": 1e-06, + "loss": 0.0183, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4762.0, + "completions/mean_length": 1052.017578125, + "completions/mean_terminated_length": 715.38720703125, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.1217985488474369, + "epoch": 1.55, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.038163021206855774, + "learning_rate": 1e-06, + "loss": 0.0278, + "num_tokens": 186078931.0, + "reward": 0.8041250109672546, + "reward_std": 0.16053986549377441, + "rewards/progression_diversity/mean": -0.002542970236390829, + "rewards/progression_diversity/std": 0.02396521344780922, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9220377206802368, + "rewards/symbolic_reward_partial_score/std": 0.2436649203300476, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0195398330688477, + "sampling/importance_sampling_ratio/min": 1.0385434734416776e-07, + "sampling/sampling_logp_difference/max": 16.080276489257812, + "sampling/sampling_logp_difference/mean": 0.07104349136352539, + "step": 589 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.12670139968395233, + "epoch": 1.5526315789473686, + "grad_norm": 0.02337627112865448, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 590 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.12126578390598297, + "epoch": 1.555263157894737, + "grad_norm": 0.02081996016204357, + "learning_rate": 1e-06, + "loss": 0.0416, + "step": 591 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.121041189879179, + "epoch": 1.5578947368421052, + "grad_norm": 0.014259042218327522, + "learning_rate": 1e-06, + "loss": 0.0228, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6324.0, + "completions/mean_length": 1310.251953125, + "completions/mean_terminated_length": 761.006103515625, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.11463157832622528, + "epoch": 1.5605263157894735, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.03503033518791199, + "learning_rate": 1e-06, + "loss": 0.0623, + "num_tokens": 187157940.0, + "reward": 0.774095892906189, + "reward_std": 0.19663885235786438, + "rewards/progression_diversity/mean": -0.002528228797018528, + "rewards/progression_diversity/std": 0.020892662927508354, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.9150390625, + "rewards/symbolic_reward_partial_score/std": 0.22795869410037994, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0178115367889404, + "sampling/importance_sampling_ratio/min": 1.8364110587754112e-07, + "sampling/sampling_logp_difference/max": 15.510282516479492, + "sampling/sampling_logp_difference/mean": 0.06724664568901062, + "step": 593 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.11785297840833664, + "epoch": 1.563157894736842, + "grad_norm": 0.015631457790732384, + "learning_rate": 1e-06, + "loss": 0.0155, + "step": 594 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.11847185716032982, + "epoch": 1.5657894736842106, + "grad_norm": 0.025584295392036438, + "learning_rate": 1e-06, + "loss": 0.034, + "step": 595 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.12040146440267563, + "epoch": 1.568421052631579, + "grad_norm": 0.024837322533130646, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7156.0, + "completions/mean_length": 1281.55859375, + "completions/mean_terminated_length": 762.888916015625, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.11434166878461838, + "epoch": 1.5710526315789473, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.031757909804582596, + "learning_rate": 1e-06, + "loss": 0.0399, + "num_tokens": 188220754.0, + "reward": 0.7410222291946411, + "reward_std": 0.2251667082309723, + "rewards/progression_diversity/mean": -0.004226570948958397, + "rewards/progression_diversity/std": 0.034974951297044754, + "rewards/symbolic_reward_accuracy/mean": 0.8046875, + "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, + "rewards/symbolic_reward_partial_score/mean": 0.8719075322151184, + "rewards/symbolic_reward_partial_score/std": 0.29337501525878906, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.018049955368042, + "sampling/importance_sampling_ratio/min": 1.5532393717876403e-06, + "sampling/sampling_logp_difference/max": 13.375167846679688, + "sampling/sampling_logp_difference/mean": 0.06641073524951935, + "step": 597 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.1158062033355236, + "epoch": 1.5736842105263158, + "grad_norm": 0.05890418589115143, + "learning_rate": 1e-06, + "loss": 0.0376, + "step": 598 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.11098961159586906, + "epoch": 1.5763157894736843, + "grad_norm": 0.0232480950653553, + "learning_rate": 1e-06, + "loss": 0.042, + "step": 599 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.11672716960310936, + "epoch": 1.5789473684210527, + "grad_norm": 0.019831934943795204, + "learning_rate": 1e-06, + "loss": 0.0268, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6496.0, + "completions/mean_length": 1332.814453125, + "completions/mean_terminated_length": 752.7484741210938, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.11477554962038994, + "epoch": 1.581578947368421, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.03628187254071236, + "learning_rate": 1e-06, + "loss": 0.018, + "num_tokens": 189330323.0, + "reward": 0.7476116418838501, + "reward_std": 0.23416918516159058, + "rewards/progression_diversity/mean": -0.004457623697817326, + "rewards/progression_diversity/std": 0.032657936215400696, + "rewards/symbolic_reward_accuracy/mean": 0.810546875, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.8834635019302368, + "rewards/symbolic_reward_partial_score/std": 0.28327476978302, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0170378684997559, + "sampling/importance_sampling_ratio/min": 2.704789210383751e-07, + "sampling/sampling_logp_difference/max": 15.123071670532227, + "sampling/sampling_logp_difference/mean": 0.06537644565105438, + "step": 601 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.11666785180568695, + "epoch": 1.5842105263157895, + "grad_norm": 0.03475677967071533, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 602 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.11766305565834045, + "epoch": 1.586842105263158, + "grad_norm": 0.016544286161661148, + "learning_rate": 1e-06, + "loss": 0.0143, + "step": 603 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.11409174650907516, + "epoch": 1.5894736842105264, + "grad_norm": 0.04165460914373398, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7985.0, + "completions/mean_length": 1443.87890625, + "completions/mean_terminated_length": 773.097900390625, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.11511170864105225, + "epoch": 1.5921052631578947, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.030341695994138718, + "learning_rate": 1e-06, + "loss": 0.0233, + "num_tokens": 190460885.0, + "reward": 0.7671371698379517, + "reward_std": 0.20124712586402893, + "rewards/progression_diversity/mean": -0.005033410154283047, + "rewards/progression_diversity/std": 0.03523511067032814, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.8912760615348816, + "rewards/symbolic_reward_partial_score/std": 0.28599029779434204, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0175724029541016, + "sampling/importance_sampling_ratio/min": 5.44225827070477e-07, + "sampling/sampling_logp_difference/max": 14.423901557922363, + "sampling/sampling_logp_difference/mean": 0.06623612344264984, + "step": 605 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.11644968762993813, + "epoch": 1.594736842105263, + "grad_norm": 0.019640203565359116, + "learning_rate": 1e-06, + "loss": 0.0203, + "step": 606 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.11942460760474205, + "epoch": 1.5973684210526315, + "grad_norm": 0.018807774409651756, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 607 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.11645659804344177, + "epoch": 1.6, + "grad_norm": 0.018924150615930557, + "learning_rate": 1e-06, + "loss": 0.044, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6067.0, + "completions/mean_length": 829.654296875, + "completions/mean_terminated_length": 737.9783935546875, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.12432442978024483, + "epoch": 1.6026315789473684, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.025178352370858192, + "learning_rate": 1e-06, + "loss": 0.0161, + "num_tokens": 191283460.0, + "reward": 0.8110204339027405, + "reward_std": 0.1572936773300171, + "rewards/progression_diversity/mean": -0.001474375487305224, + "rewards/progression_diversity/std": 0.02059447206556797, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9358724355697632, + "rewards/symbolic_reward_partial_score/std": 0.20960327982902527, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0208200216293335, + "sampling/importance_sampling_ratio/min": 8.66128682305932e-22, + "sampling/sampling_logp_difference/max": 48.498008728027344, + "sampling/sampling_logp_difference/mean": 0.07230201363563538, + "step": 609 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.12939514964818954, + "epoch": 1.6052631578947367, + "grad_norm": 0.012045629322528839, + "learning_rate": 1e-06, + "loss": 0.0036, + "step": 610 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.12374605610966682, + "epoch": 1.6078947368421053, + "grad_norm": 0.019549217075109482, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 611 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.12776687741279602, + "epoch": 1.6105263157894738, + "grad_norm": 0.014186064712703228, + "learning_rate": 1e-06, + "loss": 0.0144, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5412.0, + "completions/mean_length": 941.5625, + "completions/mean_terminated_length": 758.4506225585938, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.12887892127037048, + "epoch": 1.6131578947368421, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.04177960380911827, + "learning_rate": 1e-06, + "loss": 0.0415, + "num_tokens": 192155012.0, + "reward": 0.7932851314544678, + "reward_std": 0.19080910086631775, + "rewards/progression_diversity/mean": -0.0025479630567133427, + "rewards/progression_diversity/std": 0.03208750858902931, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.90478515625, + "rewards/symbolic_reward_partial_score/std": 0.272712379693985, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0208125114440918, + "sampling/importance_sampling_ratio/min": 1.5786597487021936e-06, + "sampling/sampling_logp_difference/max": 13.35893440246582, + "sampling/sampling_logp_difference/mean": 0.07315555959939957, + "step": 613 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.1350940614938736, + "epoch": 1.6157894736842104, + "grad_norm": 0.012729027308523655, + "learning_rate": 1e-06, + "loss": 0.0034, + "step": 614 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.12994906306266785, + "epoch": 1.618421052631579, + "grad_norm": 0.02224569581449032, + "learning_rate": 1e-06, + "loss": 0.0451, + "step": 615 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.13485563546419144, + "epoch": 1.6210526315789475, + "grad_norm": 0.023328816518187523, + "learning_rate": 1e-06, + "loss": 0.0112, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4567.0, + "completions/mean_length": 1037.134765625, + "completions/mean_terminated_length": 700.1776733398438, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.13107465207576752, + "epoch": 1.6236842105263158, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.03819314390420914, + "learning_rate": 1e-06, + "loss": 0.0452, + "num_tokens": 193093321.0, + "reward": 0.790679931640625, + "reward_std": 0.1968342363834381, + "rewards/progression_diversity/mean": -0.004279204178601503, + "rewards/progression_diversity/std": 0.03591744601726532, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.9202473759651184, + "rewards/symbolic_reward_partial_score/std": 0.24266402423381805, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0211315155029297, + "sampling/importance_sampling_ratio/min": 5.997556399961468e-06, + "sampling/sampling_logp_difference/max": 12.024158477783203, + "sampling/sampling_logp_difference/mean": 0.07138025015592575, + "step": 617 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.13448263704776764, + "epoch": 1.6263157894736842, + "grad_norm": 0.0195352453738451, + "learning_rate": 1e-06, + "loss": 0.0212, + "step": 618 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.12872284650802612, + "epoch": 1.6289473684210525, + "grad_norm": 0.035076647996902466, + "learning_rate": 1e-06, + "loss": 0.0472, + "step": 619 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.12851262837648392, + "epoch": 1.631578947368421, + "grad_norm": 0.017521562054753304, + "learning_rate": 1e-06, + "loss": 0.0411, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5643.0, + "completions/mean_length": 1071.30078125, + "completions/mean_terminated_length": 766.2669677734375, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "entropy": 0.12900108844041824, + "epoch": 1.6342105263157896, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.03199876844882965, + "learning_rate": 1e-06, + "loss": 0.0156, + "num_tokens": 194057987.0, + "reward": 0.7672144174575806, + "reward_std": 0.19012746214866638, + "rewards/progression_diversity/mean": -0.002193653956055641, + "rewards/progression_diversity/std": 0.022286431863904, + "rewards/symbolic_reward_accuracy/mean": 0.830078125, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.9031575918197632, + "rewards/symbolic_reward_partial_score/std": 0.2527180016040802, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0220717191696167, + "sampling/importance_sampling_ratio/min": 3.4926625480657947e-10, + "sampling/sampling_logp_difference/max": 21.77518653869629, + "sampling/sampling_logp_difference/mean": 0.0735945776104927, + "step": 621 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.13501236587762833, + "epoch": 1.6368421052631579, + "grad_norm": 0.023030543699860573, + "learning_rate": 1e-06, + "loss": -0.0017, + "step": 622 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.13038154691457748, + "epoch": 1.6394736842105262, + "grad_norm": 0.020995106548070908, + "learning_rate": 1e-06, + "loss": 0.0482, + "step": 623 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.13032087683677673, + "epoch": 1.6421052631578947, + "grad_norm": 0.01794985868036747, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4610.0, + "completions/mean_length": 1164.958984375, + "completions/mean_terminated_length": 578.4239501953125, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.13446198403835297, + "epoch": 1.6447368421052633, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02450123056769371, + "learning_rate": 1e-06, + "loss": 0.0266, + "num_tokens": 195026894.0, + "reward": 0.8016812801361084, + "reward_std": 0.14666664600372314, + "rewards/progression_diversity/mean": -0.007652563974261284, + "rewards/progression_diversity/std": 0.04430336132645607, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.9231771230697632, + "rewards/symbolic_reward_partial_score/std": 0.2403518557548523, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.02314293384552, + "sampling/importance_sampling_ratio/min": 3.1002757168607786e-05, + "sampling/sampling_logp_difference/max": 10.381434440612793, + "sampling/sampling_logp_difference/mean": 0.07610031962394714, + "step": 625 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.13796798884868622, + "epoch": 1.6473684210526316, + "grad_norm": 0.03212736174464226, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 626 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.13732363283634186, + "epoch": 1.65, + "grad_norm": 0.03413732722401619, + "learning_rate": 1e-06, + "loss": 0.0207, + "step": 627 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.1372433453798294, + "epoch": 1.6526315789473685, + "grad_norm": 0.029417581856250763, + "learning_rate": 1e-06, + "loss": 0.0398, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5332.0, + "completions/mean_length": 1251.904296875, + "completions/mean_terminated_length": 636.7784423828125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.13081193715333939, + "epoch": 1.655263157894737, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.03926420584321022, + "learning_rate": 1e-06, + "loss": 0.086, + "num_tokens": 196044989.0, + "reward": 0.7881196737289429, + "reward_std": 0.17913874983787537, + "rewards/progression_diversity/mean": -0.00639567244797945, + "rewards/progression_diversity/std": 0.04198707640171051, + "rewards/symbolic_reward_accuracy/mean": 0.865234375, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.9085286855697632, + "rewards/symbolic_reward_partial_score/std": 0.2608047127723694, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.023348331451416, + "sampling/importance_sampling_ratio/min": 1.0149918125534896e-05, + "sampling/sampling_logp_difference/max": 11.498044967651367, + "sampling/sampling_logp_difference/mean": 0.07550258934497833, + "step": 629 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.138485848903656, + "epoch": 1.6578947368421053, + "grad_norm": 0.01309316884726286, + "learning_rate": 1e-06, + "loss": 0.0257, + "step": 630 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.13896264880895615, + "epoch": 1.6605263157894736, + "grad_norm": 0.012009157799184322, + "learning_rate": 1e-06, + "loss": 0.0538, + "step": 631 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.1417469009757042, + "epoch": 1.663157894736842, + "grad_norm": 0.024768110364675522, + "learning_rate": 1e-06, + "loss": 0.0391, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6066.0, + "completions/mean_length": 731.333984375, + "completions/mean_terminated_length": 639.07861328125, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.14609570801258087, + "epoch": 1.6657894736842105, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.017739422619342804, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 196802888.0, + "reward": 0.8241581916809082, + "reward_std": 0.1724630892276764, + "rewards/progression_diversity/mean": -0.0011804921086877584, + "rewards/progression_diversity/std": 0.019015712663531303, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.9327799677848816, + "rewards/symbolic_reward_partial_score/std": 0.2337876856327057, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0246440172195435, + "sampling/importance_sampling_ratio/min": 1.7091028894355986e-06, + "sampling/sampling_logp_difference/max": 13.279541969299316, + "sampling/sampling_logp_difference/mean": 0.08123795688152313, + "step": 633 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.14178509265184402, + "epoch": 1.668421052631579, + "grad_norm": 0.027597403153777122, + "learning_rate": 1e-06, + "loss": 0.0282, + "step": 634 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.14551638811826706, + "epoch": 1.6710526315789473, + "grad_norm": 0.006523482967168093, + "learning_rate": 1e-06, + "loss": 0.0247, + "step": 635 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.1471918523311615, + "epoch": 1.6736842105263157, + "grad_norm": 0.019637571647763252, + "learning_rate": 1e-06, + "loss": 0.0191, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2661.0, + "completions/mean_length": 711.6328125, + "completions/mean_terminated_length": 588.2283325195312, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.15460152179002762, + "epoch": 1.6763157894736842, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.016574524343013763, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 197560204.0, + "reward": 0.855990469455719, + "reward_std": 0.12720367312431335, + "rewards/progression_diversity/mean": -0.0015454285312443972, + "rewards/progression_diversity/std": 0.024757709354162216, + "rewards/symbolic_reward_accuracy/mean": 0.943359375, + "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, + "rewards/symbolic_reward_partial_score/mean": 0.9685872793197632, + "rewards/symbolic_reward_partial_score/std": 0.14984627068042755, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0268505811691284, + "sampling/importance_sampling_ratio/min": 1.079319758900965e-06, + "sampling/sampling_logp_difference/max": 13.739179611206055, + "sampling/sampling_logp_difference/mean": 0.08289426565170288, + "step": 637 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.15232256054878235, + "epoch": 1.6789473684210527, + "grad_norm": 0.00705451937392354, + "learning_rate": 1e-06, + "loss": -0.0044, + "step": 638 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.1456858143210411, + "epoch": 1.681578947368421, + "grad_norm": 0.025568673387169838, + "learning_rate": 1e-06, + "loss": 0.0668, + "step": 639 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.1482095867395401, + "epoch": 1.6842105263157894, + "grad_norm": 0.02351662516593933, + "learning_rate": 1e-06, + "loss": 0.0198, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5108.0, + "completions/mean_length": 814.15234375, + "completions/mean_terminated_length": 629.5296630859375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.14524078369140625, + "epoch": 1.686842105263158, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.02617044374346733, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 198374842.0, + "reward": 0.8252705335617065, + "reward_std": 0.16813138127326965, + "rewards/progression_diversity/mean": -0.0022450610995292664, + "rewards/progression_diversity/std": 0.032739993184804916, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9488931894302368, + "rewards/symbolic_reward_partial_score/std": 0.18802446126937866, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0245275497436523, + "sampling/importance_sampling_ratio/min": 5.705417915891076e-09, + "sampling/sampling_logp_difference/max": 18.981849670410156, + "sampling/sampling_logp_difference/mean": 0.07951345294713974, + "step": 641 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.1453712061047554, + "epoch": 1.6894736842105265, + "grad_norm": 0.030677122995257378, + "learning_rate": 1e-06, + "loss": 0.0483, + "step": 642 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.14691882580518723, + "epoch": 1.6921052631578948, + "grad_norm": 0.009873943403363228, + "learning_rate": 1e-06, + "loss": -0.0022, + "step": 643 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.14735910296440125, + "epoch": 1.694736842105263, + "grad_norm": 0.017524579539895058, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4121.0, + "completions/mean_length": 1686.984375, + "completions/mean_terminated_length": 739.7754516601562, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.14734282344579697, + "epoch": 1.6973684210526314, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.03577494993805885, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 199650066.0, + "reward": 0.7072658538818359, + "reward_std": 0.2004225254058838, + "rewards/progression_diversity/mean": -0.010720351710915565, + "rewards/progression_diversity/std": 0.056672148406505585, + "rewards/symbolic_reward_accuracy/mean": 0.763671875, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.85009765625, + "rewards/symbolic_reward_partial_score/std": 0.32125625014305115, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0221692323684692, + "sampling/importance_sampling_ratio/min": 1.0171705753236893e-06, + "sampling/sampling_logp_difference/max": 13.79848575592041, + "sampling/sampling_logp_difference/mean": 0.06588836014270782, + "step": 645 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.13180600851774216, + "epoch": 1.7, + "grad_norm": 0.016572780907154083, + "learning_rate": 1e-06, + "loss": 0.0562, + "step": 646 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.13006508350372314, + "epoch": 1.7026315789473685, + "grad_norm": 0.024881578981876373, + "learning_rate": 1e-06, + "loss": 0.1149, + "step": 647 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.13194820284843445, + "epoch": 1.7052631578947368, + "grad_norm": 0.017025984823703766, + "learning_rate": 1e-06, + "loss": 0.0827, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13227.0, + "completions/mean_length": 966.30859375, + "completions/mean_terminated_length": 721.5833740234375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.15003421157598495, + "epoch": 1.7078947368421051, + "frac_reward_zero_std": 0.34375, + "grad_norm": 0.026492591947317123, + "learning_rate": 1e-06, + "loss": 0.0272, + "num_tokens": 200547280.0, + "reward": 0.7986052632331848, + "reward_std": 0.18857163190841675, + "rewards/progression_diversity/mean": -0.0027552107349038124, + "rewards/progression_diversity/std": 0.026247508823871613, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9153645634651184, + "rewards/symbolic_reward_partial_score/std": 0.25720417499542236, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.025015115737915, + "sampling/importance_sampling_ratio/min": 3.4338822274548875e-07, + "sampling/sampling_logp_difference/max": 14.884404182434082, + "sampling/sampling_logp_difference/mean": 0.07899387180805206, + "step": 649 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.15113919973373413, + "epoch": 1.7105263157894737, + "grad_norm": 0.029967421665787697, + "learning_rate": 1e-06, + "loss": 0.04, + "step": 650 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.14088765531778336, + "epoch": 1.7131578947368422, + "grad_norm": 0.04313409700989723, + "learning_rate": 1e-06, + "loss": 0.0421, + "step": 651 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.14788048714399338, + "epoch": 1.7157894736842105, + "grad_norm": 0.014722016640007496, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4130.0, + "completions/mean_length": 835.263671875, + "completions/mean_terminated_length": 619.7366333007812, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.14688391238451004, + "epoch": 1.7184210526315788, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.025073617696762085, + "learning_rate": 1e-06, + "loss": 0.0165, + "num_tokens": 201358551.0, + "reward": 0.7871145606040955, + "reward_std": 0.2015751600265503, + "rewards/progression_diversity/mean": -0.004367514047771692, + "rewards/progression_diversity/std": 0.043641433119773865, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.92333984375, + "rewards/symbolic_reward_partial_score/std": 0.23156411945819855, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0254812240600586, + "sampling/importance_sampling_ratio/min": 2.4464074429436655e-12, + "sampling/sampling_logp_difference/max": 26.736400604248047, + "sampling/sampling_logp_difference/mean": 0.08250489830970764, + "step": 653 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.14983681589365005, + "epoch": 1.7210526315789474, + "grad_norm": 0.03017502650618553, + "learning_rate": 1e-06, + "loss": 0.0162, + "step": 654 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.14353607594966888, + "epoch": 1.723684210526316, + "grad_norm": 0.011741744354367256, + "learning_rate": 1e-06, + "loss": 0.0456, + "step": 655 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.14645575731992722, + "epoch": 1.7263157894736842, + "grad_norm": 0.0143356928601861, + "learning_rate": 1e-06, + "loss": 0.0101, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4858.0, + "completions/mean_length": 808.5078125, + "completions/mean_terminated_length": 623.8182373046875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.14062481373548508, + "epoch": 1.7289473684210526, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.03301079943776131, + "learning_rate": 1e-06, + "loss": 0.0232, + "num_tokens": 202161115.0, + "reward": 0.7795208692550659, + "reward_std": 0.21248871088027954, + "rewards/progression_diversity/mean": -0.0020201844163239002, + "rewards/progression_diversity/std": 0.022819824516773224, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.8953450322151184, + "rewards/symbolic_reward_partial_score/std": 0.28262069821357727, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0237058401107788, + "sampling/importance_sampling_ratio/min": 4.2048032611319286e-08, + "sampling/sampling_logp_difference/max": 16.984453201293945, + "sampling/sampling_logp_difference/mean": 0.07847477495670319, + "step": 657 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.14044133573770523, + "epoch": 1.731578947368421, + "grad_norm": 0.013989591039717197, + "learning_rate": 1e-06, + "loss": 0.0237, + "step": 658 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.1442088633775711, + "epoch": 1.7342105263157894, + "grad_norm": 0.014133965596556664, + "learning_rate": 1e-06, + "loss": 0.0343, + "step": 659 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.14218034595251083, + "epoch": 1.736842105263158, + "grad_norm": 0.01604565605521202, + "learning_rate": 1e-06, + "loss": 0.0263, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.005859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7880.0, + "completions/mean_length": 708.55078125, + "completions/mean_terminated_length": 616.1611328125, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.14530527591705322, + "epoch": 1.7394736842105263, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.022363541647791862, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 202912181.0, + "reward": 0.8098558187484741, + "reward_std": 0.1898079812526703, + "rewards/progression_diversity/mean": -0.0007470193086192012, + "rewards/progression_diversity/std": 0.011871008202433586, + "rewards/symbolic_reward_accuracy/mean": 0.888671875, + "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, + "rewards/symbolic_reward_partial_score/mean": 0.9241536855697632, + "rewards/symbolic_reward_partial_score/std": 0.24054944515228271, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0253888368606567, + "sampling/importance_sampling_ratio/min": 3.482146238020789e-10, + "sampling/sampling_logp_difference/max": 21.778202056884766, + "sampling/sampling_logp_difference/mean": 0.08183024823665619, + "step": 661 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.1518562138080597, + "epoch": 1.7421052631578946, + "grad_norm": 0.020019719377160072, + "learning_rate": 1e-06, + "loss": 0.0076, + "step": 662 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.14416959136724472, + "epoch": 1.7447368421052631, + "grad_norm": 0.04639910161495209, + "learning_rate": 1e-06, + "loss": 0.0314, + "step": 663 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.1428115963935852, + "epoch": 1.7473684210526317, + "grad_norm": 0.011411590501666069, + "learning_rate": 1e-06, + "loss": 0.0255, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8516.0, + "completions/mean_length": 1431.109375, + "completions/mean_terminated_length": 727.8036499023438, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.13438043743371964, + "epoch": 1.75, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.026513205841183662, + "learning_rate": 1e-06, + "loss": 0.0408, + "num_tokens": 204042541.0, + "reward": 0.6865507364273071, + "reward_std": 0.19577746093273163, + "rewards/progression_diversity/mean": -0.007034477777779102, + "rewards/progression_diversity/std": 0.0463702417910099, + "rewards/symbolic_reward_accuracy/mean": 0.732421875, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.8362630605697632, + "rewards/symbolic_reward_partial_score/std": 0.3250221014022827, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.02474045753479, + "sampling/importance_sampling_ratio/min": 1.1017795123480725e-17, + "sampling/sampling_logp_difference/max": 39.047019958496094, + "sampling/sampling_logp_difference/mean": 0.07735016196966171, + "step": 665 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.13546551764011383, + "epoch": 1.7526315789473683, + "grad_norm": 0.0192271675914526, + "learning_rate": 1e-06, + "loss": 0.0348, + "step": 666 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.13766096532344818, + "epoch": 1.7552631578947369, + "grad_norm": 0.017237944528460503, + "learning_rate": 1e-06, + "loss": 0.0368, + "step": 667 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.13993556797504425, + "epoch": 1.7578947368421054, + "grad_norm": 0.02463732473552227, + "learning_rate": 1e-06, + "loss": 0.0289, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8065.0, + "completions/mean_length": 977.833984375, + "completions/mean_terminated_length": 670.9382934570312, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.14678215980529785, + "epoch": 1.7605263157894737, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.023546183481812477, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 204941848.0, + "reward": 0.7797601222991943, + "reward_std": 0.18229469656944275, + "rewards/progression_diversity/mean": -0.002507382072508335, + "rewards/progression_diversity/std": 0.025364823639392853, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.9091796875, + "rewards/symbolic_reward_partial_score/std": 0.2537998855113983, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0251445770263672, + "sampling/importance_sampling_ratio/min": 1.523035741968215e-08, + "sampling/sampling_logp_difference/max": 17.999975204467773, + "sampling/sampling_logp_difference/mean": 0.0789584219455719, + "step": 669 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.1442192792892456, + "epoch": 1.763157894736842, + "grad_norm": 0.020266570150852203, + "learning_rate": 1e-06, + "loss": 0.0654, + "step": 670 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.14410430938005447, + "epoch": 1.7657894736842106, + "grad_norm": 0.00926806777715683, + "learning_rate": 1e-06, + "loss": 0.0179, + "step": 671 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.14117338508367538, + "epoch": 1.768421052631579, + "grad_norm": 0.008366767317056656, + "learning_rate": 1e-06, + "loss": 0.0513, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3605.0, + "completions/mean_length": 996.298828125, + "completions/mean_terminated_length": 658.4451293945312, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.14415088295936584, + "epoch": 1.7710526315789474, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.023884868249297142, + "learning_rate": 1e-06, + "loss": 0.0597, + "num_tokens": 205849745.0, + "reward": 0.8204151391983032, + "reward_std": 0.13469088077545166, + "rewards/progression_diversity/mean": -0.0043905326165258884, + "rewards/progression_diversity/std": 0.046648427844047546, + "rewards/symbolic_reward_accuracy/mean": 0.8984375, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.9431966543197632, + "rewards/symbolic_reward_partial_score/std": 0.2042941004037857, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0248339176177979, + "sampling/importance_sampling_ratio/min": 3.4787309033390557e-08, + "sampling/sampling_logp_difference/max": 17.174013137817383, + "sampling/sampling_logp_difference/mean": 0.07455458492040634, + "step": 673 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.13898008316755295, + "epoch": 1.7736842105263158, + "grad_norm": 0.029918354004621506, + "learning_rate": 1e-06, + "loss": 0.0328, + "step": 674 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.14073319733142853, + "epoch": 1.776315789473684, + "grad_norm": 0.01222238689661026, + "learning_rate": 1e-06, + "loss": 0.0455, + "step": 675 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.13992726802825928, + "epoch": 1.7789473684210526, + "grad_norm": 0.00996064767241478, + "learning_rate": 1e-06, + "loss": 0.046, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 6681.0, + "completions/mean_length": 1095.326171875, + "completions/mean_terminated_length": 728.3980102539062, + "completions/min_length": 263.0, + "completions/min_terminated_length": 263.0, + "entropy": 0.14075642824172974, + "epoch": 1.7815789473684212, + "frac_reward_zero_std": 0.21875, + "grad_norm": 0.03422519192099571, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 206820696.0, + "reward": 0.7328424453735352, + "reward_std": 0.20037183165550232, + "rewards/progression_diversity/mean": -0.0018944772891700268, + "rewards/progression_diversity/std": 0.018759803846478462, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.8836262822151184, + "rewards/symbolic_reward_partial_score/std": 0.27410218119621277, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0241965055465698, + "sampling/importance_sampling_ratio/min": 8.016753927364562e-11, + "sampling/sampling_logp_difference/max": 23.246902465820312, + "sampling/sampling_logp_difference/mean": 0.07802345603704453, + "step": 677 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.1405147835612297, + "epoch": 1.7842105263157895, + "grad_norm": 0.01229146309196949, + "learning_rate": 1e-06, + "loss": 0.0232, + "step": 678 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.13932440429925919, + "epoch": 1.7868421052631578, + "grad_norm": 0.01690313220024109, + "learning_rate": 1e-06, + "loss": 0.0334, + "step": 679 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.14134883880615234, + "epoch": 1.7894736842105263, + "grad_norm": 0.03306613117456436, + "learning_rate": 1e-06, + "loss": 0.0357, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14676.0, + "completions/mean_length": 1426.642578125, + "completions/mean_terminated_length": 755.0877075195312, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.13857001811265945, + "epoch": 1.7921052631578949, + "frac_reward_zero_std": 0.40625, + "grad_norm": 0.03711333125829697, + "learning_rate": 1e-06, + "loss": 0.0339, + "num_tokens": 207951297.0, + "reward": 0.7199710607528687, + "reward_std": 0.15880054235458374, + "rewards/progression_diversity/mean": -0.00485343299806118, + "rewards/progression_diversity/std": 0.03532535955309868, + "rewards/symbolic_reward_accuracy/mean": 0.775390625, + "rewards/symbolic_reward_accuracy/std": 0.41773295402526855, + "rewards/symbolic_reward_partial_score/mean": 0.8616536855697632, + "rewards/symbolic_reward_partial_score/std": 0.30703648924827576, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0234718322753906, + "sampling/importance_sampling_ratio/min": 3.384801416927985e-08, + "sampling/sampling_logp_difference/max": 17.201385498046875, + "sampling/sampling_logp_difference/mean": 0.07443863153457642, + "step": 681 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.13849762827157974, + "epoch": 1.7947368421052632, + "grad_norm": 0.017198480665683746, + "learning_rate": 1e-06, + "loss": 0.0603, + "step": 682 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.13620536029338837, + "epoch": 1.7973684210526315, + "grad_norm": 0.024120958521962166, + "learning_rate": 1e-06, + "loss": 0.0262, + "step": 683 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.13732768595218658, + "epoch": 1.8, + "grad_norm": 0.031364087015390396, + "learning_rate": 1e-06, + "loss": 0.0579, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.013671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5863.0, + "completions/mean_length": 877.501953125, + "completions/mean_terminated_length": 662.5604248046875, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.147243432700634, + "epoch": 1.8026315789473686, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.018228838220238686, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 208811746.0, + "reward": 0.7966721653938293, + "reward_std": 0.1374010145664215, + "rewards/progression_diversity/mean": -0.0007563849212601781, + "rewards/progression_diversity/std": 0.012015795335173607, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9166666269302368, + "rewards/symbolic_reward_partial_score/std": 0.2510036528110504, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0271267890930176, + "sampling/importance_sampling_ratio/min": 4.275814262655331e-07, + "sampling/sampling_logp_difference/max": 14.665121078491211, + "sampling/sampling_logp_difference/mean": 0.08050627261400223, + "step": 685 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.14790859073400497, + "epoch": 1.805263157894737, + "grad_norm": 0.019763531163334846, + "learning_rate": 1e-06, + "loss": 0.0097, + "step": 686 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.15091699361801147, + "epoch": 1.8078947368421052, + "grad_norm": 0.018503857776522636, + "learning_rate": 1e-06, + "loss": 0.0235, + "step": 687 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.15277040749788284, + "epoch": 1.8105263157894735, + "grad_norm": 0.014150720089673996, + "learning_rate": 1e-06, + "loss": 0.0199, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9039.0, + "completions/mean_length": 803.11328125, + "completions/mean_terminated_length": 649.4556274414062, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.16078810393810272, + "epoch": 1.813157894736842, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.019453125074505806, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 209626588.0, + "reward": 0.838794469833374, + "reward_std": 0.13540925085544586, + "rewards/progression_diversity/mean": -0.0023899937514215708, + "rewards/progression_diversity/std": 0.026318540796637535, + "rewards/symbolic_reward_accuracy/mean": 0.919921875, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.9581705927848816, + "rewards/symbolic_reward_partial_score/std": 0.17204301059246063, + "rewards/tag_count_reward/mean": -0.005859375, + "rewards/tag_count_reward/std": 0.07639661431312561, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0289819240570068, + "sampling/importance_sampling_ratio/min": 5.673148734786082e-06, + "sampling/sampling_logp_difference/max": 12.079766273498535, + "sampling/sampling_logp_difference/mean": 0.08512907475233078, + "step": 689 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.15033067017793655, + "epoch": 1.8157894736842106, + "grad_norm": 0.053703587502241135, + "learning_rate": 1e-06, + "loss": 0.028, + "step": 690 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.16108743101358414, + "epoch": 1.818421052631579, + "grad_norm": 0.012639776803553104, + "learning_rate": 1e-06, + "loss": 0.0456, + "step": 691 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1015625, + "entropy": 0.1631702333688736, + "epoch": 1.8210526315789473, + "grad_norm": 0.009674533270299435, + "learning_rate": 1e-06, + "loss": 0.0075, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13431.0, + "completions/mean_length": 1337.83984375, + "completions/mean_terminated_length": 726.207275390625, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.15106821060180664, + "epoch": 1.8236842105263158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.027334807440638542, + "learning_rate": 1e-06, + "loss": 0.0295, + "num_tokens": 210713706.0, + "reward": 0.7495685815811157, + "reward_std": 0.16618216037750244, + "rewards/progression_diversity/mean": -0.0040826634503901005, + "rewards/progression_diversity/std": 0.031112631782889366, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.8671875, + "rewards/symbolic_reward_partial_score/std": 0.31214237213134766, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0262157917022705, + "sampling/importance_sampling_ratio/min": 1.5291009569651237e-09, + "sampling/sampling_logp_difference/max": 20.298585891723633, + "sampling/sampling_logp_difference/mean": 0.07850104570388794, + "step": 693 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.15497365593910217, + "epoch": 1.8263157894736843, + "grad_norm": 0.018270188942551613, + "learning_rate": 1e-06, + "loss": 0.0111, + "step": 694 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.1553620547056198, + "epoch": 1.8289473684210527, + "grad_norm": 0.009259329177439213, + "learning_rate": 1e-06, + "loss": 0.0377, + "step": 695 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.14360713958740234, + "epoch": 1.831578947368421, + "grad_norm": 0.010323197580873966, + "learning_rate": 1e-06, + "loss": 0.0499, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11742.0, + "completions/mean_length": 1135.07421875, + "completions/mean_terminated_length": 674.8450317382812, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.1573568657040596, + "epoch": 1.8342105263157895, + "frac_reward_zero_std": 0.28125, + "grad_norm": 0.03801272064447403, + "learning_rate": 1e-06, + "loss": 0.0393, + "num_tokens": 211690672.0, + "reward": 0.7663776874542236, + "reward_std": 0.2096462845802307, + "rewards/progression_diversity/mean": -0.007742568850517273, + "rewards/progression_diversity/std": 0.055101774632930756, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.88818359375, + "rewards/symbolic_reward_partial_score/std": 0.2851507365703583, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0286474227905273, + "sampling/importance_sampling_ratio/min": 1.8554568725903664e-07, + "sampling/sampling_logp_difference/max": 15.499964714050293, + "sampling/sampling_logp_difference/mean": 0.08280766010284424, + "step": 697 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.15686368942260742, + "epoch": 1.836842105263158, + "grad_norm": 0.022794852033257484, + "learning_rate": 1e-06, + "loss": 0.0325, + "step": 698 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.1601133495569229, + "epoch": 1.8394736842105264, + "grad_norm": 0.009969050996005535, + "learning_rate": 1e-06, + "loss": 0.0541, + "step": 699 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.16635262221097946, + "epoch": 1.8421052631578947, + "grad_norm": 0.016408884897828102, + "learning_rate": 1e-06, + "loss": 0.0137, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7094.0, + "completions/mean_length": 898.439453125, + "completions/mean_terminated_length": 621.36181640625, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.16016201674938202, + "epoch": 1.844736842105263, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.06322816014289856, + "learning_rate": 1e-06, + "loss": 0.0253, + "num_tokens": 212557809.0, + "reward": 0.7900543212890625, + "reward_std": 0.2123447060585022, + "rewards/progression_diversity/mean": -0.003359610214829445, + "rewards/progression_diversity/std": 0.036421939730644226, + "rewards/symbolic_reward_accuracy/mean": 0.865234375, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.9064127206802368, + "rewards/symbolic_reward_partial_score/std": 0.2696720063686371, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0269652605056763, + "sampling/importance_sampling_ratio/min": 2.2314340597517912e-08, + "sampling/sampling_logp_difference/max": 17.6180362701416, + "sampling/sampling_logp_difference/mean": 0.0807926282286644, + "step": 701 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.15802810341119766, + "epoch": 1.8473684210526315, + "grad_norm": 0.015205918811261654, + "learning_rate": 1e-06, + "loss": 0.0352, + "step": 702 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.16626133769750595, + "epoch": 1.85, + "grad_norm": 0.01964223012328148, + "learning_rate": 1e-06, + "loss": 0.045, + "step": 703 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.1609407141804695, + "epoch": 1.8526315789473684, + "grad_norm": 0.01399518083781004, + "learning_rate": 1e-06, + "loss": 0.0803, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11244.0, + "completions/mean_length": 1886.1796875, + "completions/mean_terminated_length": 657.5508422851562, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.15396300703287125, + "epoch": 1.8552631578947367, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.04543311893939972, + "learning_rate": 1e-06, + "loss": 0.0924, + "num_tokens": 213938029.0, + "reward": 0.7416077852249146, + "reward_std": 0.23779910802841187, + "rewards/progression_diversity/mean": -0.014027466997504234, + "rewards/progression_diversity/std": 0.06849034875631332, + "rewards/symbolic_reward_accuracy/mean": 0.80859375, + "rewards/symbolic_reward_accuracy/std": 0.3937928080558777, + "rewards/symbolic_reward_partial_score/mean": 0.86376953125, + "rewards/symbolic_reward_partial_score/std": 0.31700682640075684, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0203101634979248, + "sampling/importance_sampling_ratio/min": 1.0977701947467722e-07, + "sampling/sampling_logp_difference/max": 16.02481460571289, + "sampling/sampling_logp_difference/mean": 0.06179344281554222, + "step": 705 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.1514749750494957, + "epoch": 1.8578947368421053, + "grad_norm": 0.024540159851312637, + "learning_rate": 1e-06, + "loss": 0.0743, + "step": 706 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.15695882588624954, + "epoch": 1.8605263157894738, + "grad_norm": 0.01950952038168907, + "learning_rate": 1e-06, + "loss": 0.0336, + "step": 707 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.13617245852947235, + "epoch": 1.8631578947368421, + "grad_norm": 0.031919222325086594, + "learning_rate": 1e-06, + "loss": 0.1621, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.146484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5933.0, + "completions/mean_length": 2955.310546875, + "completions/mean_terminated_length": 650.6155395507812, + "completions/min_length": 255.0, + "completions/min_terminated_length": 255.0, + "entropy": 0.1501893624663353, + "epoch": 1.8657894736842104, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.027009177953004837, + "learning_rate": 1e-06, + "loss": 0.1153, + "num_tokens": 215862796.0, + "reward": 0.6578521728515625, + "reward_std": 0.2280564308166504, + "rewards/progression_diversity/mean": -0.025331620126962662, + "rewards/progression_diversity/std": 0.08841913938522339, + "rewards/symbolic_reward_accuracy/mean": 0.7109375, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.8011067509651184, + "rewards/symbolic_reward_partial_score/std": 0.3672308921813965, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0195657014846802, + "sampling/importance_sampling_ratio/min": 1.3204642934638855e-10, + "sampling/sampling_logp_difference/max": 22.747867584228516, + "sampling/sampling_logp_difference/mean": 0.05873488262295723, + "step": 709 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.14868657290935516, + "epoch": 1.868421052631579, + "grad_norm": 0.06350129097700119, + "learning_rate": 1e-06, + "loss": 0.1039, + "step": 710 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.1268022656440735, + "epoch": 1.8710526315789475, + "grad_norm": 0.012040435336530209, + "learning_rate": 1e-06, + "loss": 0.2382, + "step": 711 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.14944910258054733, + "epoch": 1.8736842105263158, + "grad_norm": 0.042405128479003906, + "learning_rate": 1e-06, + "loss": 0.0774, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14478.0, + "completions/mean_length": 2155.14453125, + "completions/mean_terminated_length": 615.22509765625, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.14173594117164612, + "epoch": 1.8763157894736842, + "frac_reward_zero_std": 0.15625, + "grad_norm": 0.10443775355815887, + "learning_rate": 1e-06, + "loss": 0.135, + "num_tokens": 217349846.0, + "reward": 0.722713828086853, + "reward_std": 0.21990327537059784, + "rewards/progression_diversity/mean": -0.028423257172107697, + "rewards/progression_diversity/std": 0.10595274716615677, + "rewards/symbolic_reward_accuracy/mean": 0.79296875, + "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, + "rewards/symbolic_reward_partial_score/mean": 0.8435872197151184, + "rewards/symbolic_reward_partial_score/std": 0.33945873379707336, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0188918113708496, + "sampling/importance_sampling_ratio/min": 1.1887545392497145e-09, + "sampling/sampling_logp_difference/max": 20.55035972595215, + "sampling/sampling_logp_difference/mean": 0.059576526284217834, + "step": 713 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.1623135283589363, + "epoch": 1.8789473684210525, + "grad_norm": 0.030199255794286728, + "learning_rate": 1e-06, + "loss": 0.1038, + "step": 714 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.015625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.1539328470826149, + "epoch": 1.881578947368421, + "grad_norm": 0.028396856039762497, + "learning_rate": 1e-06, + "loss": 0.0772, + "step": 715 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.14560812711715698, + "epoch": 1.8842105263157896, + "grad_norm": 0.03335552662611008, + "learning_rate": 1e-06, + "loss": 0.1204, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13825.0, + "completions/mean_length": 2713.19140625, + "completions/mean_terminated_length": 654.889892578125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.1502734199166298, + "epoch": 1.8868421052631579, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.0449819378554821, + "learning_rate": 1e-06, + "loss": 0.1002, + "num_tokens": 219142648.0, + "reward": 0.6932658553123474, + "reward_std": 0.27170705795288086, + "rewards/progression_diversity/mean": -0.033768799155950546, + "rewards/progression_diversity/std": 0.11512268334627151, + "rewards/symbolic_reward_accuracy/mean": 0.763671875, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.8133138418197632, + "rewards/symbolic_reward_partial_score/std": 0.36641624569892883, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0132670402526855, + "sampling/importance_sampling_ratio/min": 2.8605501180398087e-15, + "sampling/sampling_logp_difference/max": 33.487762451171875, + "sampling/sampling_logp_difference/mean": 0.046033672988414764, + "step": 717 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.1434255838394165, + "epoch": 1.8894736842105262, + "grad_norm": 0.020671065896749496, + "learning_rate": 1e-06, + "loss": 0.1992, + "step": 718 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.5, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.671875, + "entropy": 0.12531127035617828, + "epoch": 1.8921052631578947, + "grad_norm": 0.022249706089496613, + "learning_rate": 1e-06, + "loss": 0.2442, + "step": 719 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4453125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.15144315361976624, + "epoch": 1.8947368421052633, + "grad_norm": 0.03603983670473099, + "learning_rate": 1e-06, + "loss": 0.1141, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14424.0, + "completions/mean_length": 2154.701171875, + "completions/mean_terminated_length": 614.7337646484375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.1503245010972023, + "epoch": 1.8973684210526316, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.14981180429458618, + "learning_rate": 1e-06, + "loss": 0.2187, + "num_tokens": 220655679.0, + "reward": 0.7320119738578796, + "reward_std": 0.22837099432945251, + "rewards/progression_diversity/mean": -0.021462373435497284, + "rewards/progression_diversity/std": 0.08626196533441544, + "rewards/symbolic_reward_accuracy/mean": 0.802734375, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.8561197519302368, + "rewards/symbolic_reward_partial_score/std": 0.3233526647090912, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.019932508468628, + "sampling/importance_sampling_ratio/min": 4.195763745121206e-13, + "sampling/sampling_logp_difference/max": 28.499530792236328, + "sampling/sampling_logp_difference/mean": 0.06518127024173737, + "step": 721 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.17081984877586365, + "epoch": 1.9, + "grad_norm": 0.023720135912299156, + "learning_rate": 1e-06, + "loss": 0.0536, + "step": 722 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.16384224593639374, + "epoch": 1.9026315789473685, + "grad_norm": 0.034469299018383026, + "learning_rate": 1e-06, + "loss": 0.1537, + "step": 723 + }, + { + "clip_ratio/high_max": 0.25, + "clip_ratio/high_mean": 0.0078125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.16798613965511322, + "epoch": 1.905263157894737, + "grad_norm": 0.03507945314049721, + "learning_rate": 1e-06, + "loss": 0.0875, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.126953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16240.0, + "completions/mean_length": 2634.119140625, + "completions/mean_terminated_length": 634.6957397460938, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.15430939197540283, + "epoch": 1.9078947368421053, + "frac_reward_zero_std": 0.03125, + "grad_norm": 0.09424492716789246, + "learning_rate": 1e-06, + "loss": 0.1251, + "num_tokens": 222400316.0, + "reward": 0.671546220779419, + "reward_std": 0.3024430274963379, + "rewards/progression_diversity/mean": -0.03776249662041664, + "rewards/progression_diversity/std": 0.12152393907308578, + "rewards/symbolic_reward_accuracy/mean": 0.732421875, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.79638671875, + "rewards/symbolic_reward_partial_score/std": 0.3773951828479767, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0167055130004883, + "sampling/importance_sampling_ratio/min": 6.6573236350974275e-15, + "sampling/sampling_logp_difference/max": 32.64305877685547, + "sampling/sampling_logp_difference/mean": 0.05589202791452408, + "step": 725 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.14185352623462677, + "epoch": 1.9105263157894736, + "grad_norm": 0.06517547369003296, + "learning_rate": 1e-06, + "loss": 0.2122, + "step": 726 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.15953659266233444, + "epoch": 1.913157894736842, + "grad_norm": 0.022468971088528633, + "learning_rate": 1e-06, + "loss": 0.1739, + "step": 727 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4921875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.640625, + "entropy": 0.1624743863940239, + "epoch": 1.9157894736842105, + "grad_norm": 0.009907384403049946, + "learning_rate": 1e-06, + "loss": 0.0998, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.166015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15558.0, + "completions/mean_length": 3540.01953125, + "completions/mean_terminated_length": 983.2552490234375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.1347077265381813, + "epoch": 1.918421052631579, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.13737860321998596, + "learning_rate": 1e-06, + "loss": 0.2226, + "num_tokens": 224636646.0, + "reward": 0.5830076932907104, + "reward_std": 0.30949866771698, + "rewards/progression_diversity/mean": -0.048849739134311676, + "rewards/progression_diversity/std": 0.1297674924135208, + "rewards/symbolic_reward_accuracy/mean": 0.630859375, + "rewards/symbolic_reward_accuracy/std": 0.4830440282821655, + "rewards/symbolic_reward_partial_score/mean": 0.7255859375, + "rewards/symbolic_reward_partial_score/std": 0.4099434018135071, + "rewards/tag_count_reward/mean": -0.126953125, + "rewards/tag_count_reward/std": 0.33324605226516724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0149319171905518, + "sampling/importance_sampling_ratio/min": 4.369263012223404e-22, + "sampling/sampling_logp_difference/max": 49.18227767944336, + "sampling/sampling_logp_difference/mean": 0.04961749166250229, + "step": 729 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0390625, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.15040332078933716, + "epoch": 1.9210526315789473, + "grad_norm": 0.059476714581251144, + "learning_rate": 1e-06, + "loss": 0.1366, + "step": 730 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.046875, + "clip_ratio/low_mean": 0.2890625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.15057525038719177, + "epoch": 1.9236842105263157, + "grad_norm": 0.06425424665212631, + "learning_rate": 1e-06, + "loss": 0.2423, + "step": 731 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.25, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.15043792128562927, + "epoch": 1.9263157894736842, + "grad_norm": 0.022221507504582405, + "learning_rate": 1e-06, + "loss": 0.2027, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14566.0, + "completions/mean_length": 2165.96875, + "completions/mean_terminated_length": 795.9229125976562, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.17250816524028778, + "epoch": 1.9289473684210527, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.7282646298408508, + "learning_rate": 1e-06, + "loss": 0.1429, + "num_tokens": 226135254.0, + "reward": 0.6728378534317017, + "reward_std": 0.29266250133514404, + "rewards/progression_diversity/mean": -0.030671168118715286, + "rewards/progression_diversity/std": 0.11537665873765945, + "rewards/symbolic_reward_accuracy/mean": 0.724609375, + "rewards/symbolic_reward_accuracy/std": 0.44714778661727905, + "rewards/symbolic_reward_partial_score/mean": 0.8147786259651184, + "rewards/symbolic_reward_partial_score/std": 0.35132884979248047, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0237343311309814, + "sampling/importance_sampling_ratio/min": 3.73120688677813e-20, + "sampling/sampling_logp_difference/max": 44.73497009277344, + "sampling/sampling_logp_difference/mean": 0.07423095405101776, + "step": 733 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.18838734924793243, + "epoch": 1.931578947368421, + "grad_norm": 0.019116874784231186, + "learning_rate": 1e-06, + "loss": 0.1143, + "step": 734 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.17143803089857101, + "epoch": 1.9342105263157894, + "grad_norm": 0.1997663974761963, + "learning_rate": 1e-06, + "loss": 0.1573, + "step": 735 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.1800050213932991, + "epoch": 1.936842105263158, + "grad_norm": 0.02649916335940361, + "learning_rate": 1e-06, + "loss": 0.0814, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15817.0, + "completions/mean_length": 2609.44921875, + "completions/mean_terminated_length": 1018.9237060546875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.1816265881061554, + "epoch": 1.9394736842105265, + "frac_reward_zero_std": 0.09375, + "grad_norm": 0.5376126170158386, + "learning_rate": 1e-06, + "loss": 0.1134, + "num_tokens": 227868924.0, + "reward": 0.6405581831932068, + "reward_std": 0.2928628921508789, + "rewards/progression_diversity/mean": -0.04574853554368019, + "rewards/progression_diversity/std": 0.13723419606685638, + "rewards/symbolic_reward_accuracy/mean": 0.69921875, + "rewards/symbolic_reward_accuracy/std": 0.45904624462127686, + "rewards/symbolic_reward_partial_score/mean": 0.7682291269302368, + "rewards/symbolic_reward_partial_score/std": 0.39795729517936707, + "rewards/tag_count_reward/mean": -0.08984375, + "rewards/tag_count_reward/std": 0.2862374484539032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0221675634384155, + "sampling/importance_sampling_ratio/min": 5.437771195999072e-18, + "sampling/sampling_logp_difference/max": 39.7531623840332, + "sampling/sampling_logp_difference/mean": 0.06859034299850464, + "step": 737 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.18071607500314713, + "epoch": 1.9421052631578948, + "grad_norm": 0.05956464633345604, + "learning_rate": 1e-06, + "loss": 0.1105, + "step": 738 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.16678573191165924, + "epoch": 1.944736842105263, + "grad_norm": 0.16846124827861786, + "learning_rate": 1e-06, + "loss": 0.1718, + "step": 739 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.17628037184476852, + "epoch": 1.9473684210526314, + "grad_norm": 0.04649341478943825, + "learning_rate": 1e-06, + "loss": 0.1492, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16126.0, + "completions/mean_length": 2970.525390625, + "completions/mean_terminated_length": 1454.219482421875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.14562518894672394, + "epoch": 1.95, + "frac_reward_zero_std": 0.03125, + "grad_norm": 2.966646194458008, + "learning_rate": 1e-06, + "loss": 0.2426, + "num_tokens": 229805993.0, + "reward": 0.6170042157173157, + "reward_std": 0.3738676607608795, + "rewards/progression_diversity/mean": -0.07204228639602661, + "rewards/progression_diversity/std": 0.18077191710472107, + "rewards/symbolic_reward_accuracy/mean": 0.67578125, + "rewards/symbolic_reward_accuracy/std": 0.4685399830341339, + "rewards/symbolic_reward_partial_score/mean": 0.74072265625, + "rewards/symbolic_reward_partial_score/std": 0.41318923234939575, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.013148307800293, + "sampling/importance_sampling_ratio/min": 2.707424684788481e-31, + "sampling/sampling_logp_difference/max": 70.38414001464844, + "sampling/sampling_logp_difference/mean": 0.05534841865301132, + "step": 741 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.17404460906982422, + "epoch": 1.9526315789473685, + "grad_norm": 0.013007045723497868, + "learning_rate": 1e-06, + "loss": 0.2101, + "step": 742 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.390625, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.16387271136045456, + "epoch": 1.9552631578947368, + "grad_norm": 0.044528309255838394, + "learning_rate": 1e-06, + "loss": 0.1907, + "step": 743 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.5078125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.71875, + "entropy": 0.18834584951400757, + "epoch": 1.9578947368421051, + "grad_norm": 0.012452390044927597, + "learning_rate": 1e-06, + "loss": 0.1549, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16311.0, + "completions/mean_length": 3217.857421875, + "completions/mean_terminated_length": 1697.5838623046875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.13727252185344696, + "epoch": 1.9605263157894737, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4300470352172852, + "learning_rate": 1e-06, + "loss": 0.2051, + "num_tokens": 231876032.0, + "reward": 0.619334876537323, + "reward_std": 0.37329334020614624, + "rewards/progression_diversity/mean": -0.07823123037815094, + "rewards/progression_diversity/std": 0.17627538740634918, + "rewards/symbolic_reward_accuracy/mean": 0.681640625, + "rewards/symbolic_reward_accuracy/std": 0.46629536151885986, + "rewards/symbolic_reward_partial_score/mean": 0.7376302480697632, + "rewards/symbolic_reward_partial_score/std": 0.4179321825504303, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.012220859527588, + "sampling/importance_sampling_ratio/min": 5.182242701661106e-28, + "sampling/sampling_logp_difference/max": 62.827144622802734, + "sampling/sampling_logp_difference/mean": 0.056160710752010345, + "step": 745 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.3046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.1606038585305214, + "epoch": 1.9631578947368422, + "grad_norm": 0.028117861598730087, + "learning_rate": 1e-06, + "loss": 0.2423, + "step": 746 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.03125, + "clip_ratio/low_mean": 0.2578125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.1709175780415535, + "epoch": 1.9657894736842105, + "grad_norm": 0.67425537109375, + "learning_rate": 1e-06, + "loss": 0.2054, + "step": 747 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.3125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.15508843958377838, + "epoch": 1.9684210526315788, + "grad_norm": 0.06400001794099808, + "learning_rate": 1e-06, + "loss": 0.2487, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15746.0, + "completions/mean_length": 2000.880859375, + "completions/mean_terminated_length": 1293.5142822265625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.21451418101787567, + "epoch": 1.9710526315789474, + "frac_reward_zero_std": 0.0, + "grad_norm": 5.9699931144714355, + "learning_rate": 1e-06, + "loss": 0.0579, + "num_tokens": 233257411.0, + "reward": 0.651739239692688, + "reward_std": 0.3094671964645386, + "rewards/progression_diversity/mean": -0.0653361827135086, + "rewards/progression_diversity/std": 0.19041091203689575, + "rewards/symbolic_reward_accuracy/mean": 0.7109375, + "rewards/symbolic_reward_accuracy/std": 0.45377036929130554, + "rewards/symbolic_reward_partial_score/mean": 0.76904296875, + "rewards/symbolic_reward_partial_score/std": 0.39429956674575806, + "rewards/tag_count_reward/mean": -0.048828125, + "rewards/tag_count_reward/std": 0.2157193273305893, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.021796703338623, + "sampling/importance_sampling_ratio/min": 8.134976418108564e-29, + "sampling/sampling_logp_difference/max": 64.67879486083984, + "sampling/sampling_logp_difference/mean": 0.07527394592761993, + "step": 749 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.5078125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.640625, + "entropy": 0.17407967895269394, + "epoch": 1.973684210526316, + "grad_norm": 0.060622621327638626, + "learning_rate": 1e-06, + "loss": 0.1803, + "step": 750 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.5859375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.7265625, + "entropy": 0.19498290121555328, + "epoch": 1.9763157894736842, + "grad_norm": 0.015352616086602211, + "learning_rate": 1e-06, + "loss": 0.181, + "step": 751 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.6875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.78125, + "entropy": 0.24272602796554565, + "epoch": 1.9789473684210526, + "grad_norm": 0.01805986650288105, + "learning_rate": 1e-06, + "loss": 0.0971, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14517.0, + "completions/mean_length": 2323.900390625, + "completions/mean_terminated_length": 1448.79052734375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.18802518397569656, + "epoch": 1.981578947368421, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.341567039489746, + "learning_rate": 1e-06, + "loss": 0.1657, + "num_tokens": 234866896.0, + "reward": 0.6480075120925903, + "reward_std": 0.29000890254974365, + "rewards/progression_diversity/mean": -0.08694960176944733, + "rewards/progression_diversity/std": 0.21665886044502258, + "rewards/symbolic_reward_accuracy/mean": 0.70703125, + "rewards/symbolic_reward_accuracy/std": 0.455569326877594, + "rewards/symbolic_reward_partial_score/mean": 0.7762044072151184, + "rewards/symbolic_reward_partial_score/std": 0.3934388756752014, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.02323579788208, + "sampling/importance_sampling_ratio/min": 1.605486811297672e-22, + "sampling/sampling_logp_difference/max": 50.18344497680664, + "sampling/sampling_logp_difference/mean": 0.08397974073886871, + "step": 753 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.21599113941192627, + "epoch": 1.9842105263157894, + "grad_norm": 0.01323725562542677, + "learning_rate": 1e-06, + "loss": 0.1011, + "step": 754 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.22974644601345062, + "epoch": 1.986842105263158, + "grad_norm": 0.02560954913496971, + "learning_rate": 1e-06, + "loss": 0.1258, + "step": 755 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2734375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.24319110810756683, + "epoch": 1.9894736842105263, + "grad_norm": 0.5963929891586304, + "learning_rate": 1e-06, + "loss": 0.1971, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14817.0, + "completions/mean_length": 1997.1171875, + "completions/mean_terminated_length": 1164.818115234375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.22843757271766663, + "epoch": 1.9921052631578946, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.2587209939956665, + "learning_rate": 1e-06, + "loss": 0.0798, + "num_tokens": 236294732.0, + "reward": 0.694200873374939, + "reward_std": 0.2883387506008148, + "rewards/progression_diversity/mean": -0.07210341095924377, + "rewards/progression_diversity/std": 0.2020467221736908, + "rewards/symbolic_reward_accuracy/mean": 0.7578125, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.8216145634651184, + "rewards/symbolic_reward_partial_score/std": 0.3572600483894348, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0223069190979004, + "sampling/importance_sampling_ratio/min": 8.752450397611482e-24, + "sampling/sampling_logp_difference/max": 53.092708587646484, + "sampling/sampling_logp_difference/mean": 0.10721838474273682, + "step": 757 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.244800366461277, + "epoch": 1.9947368421052631, + "grad_norm": 0.015508892014622688, + "learning_rate": 1e-06, + "loss": 0.1874, + "step": 758 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.26591283082962036, + "epoch": 1.9973684210526317, + "grad_norm": 0.013314544223248959, + "learning_rate": 1e-06, + "loss": 0.1516, + "step": 759 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.2783430367708206, + "epoch": 2.0, + "grad_norm": 0.01680191606283188, + "learning_rate": 1e-06, + "loss": 0.1542, + "step": 760 + }, + { + "epoch": 2.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.020263671875, + "eval_completions/max_length": 15967.3125, + "eval_completions/max_terminated_length": 12734.125, + "eval_completions/mean_length": 1709.68359375, + "eval_completions/mean_terminated_length": 1407.859390258789, + "eval_completions/min_length": 175.84375, + "eval_completions/min_terminated_length": 175.84375, + "eval_entropy": 0.18563631316646934, + "eval_frac_reward_zero_std": 0.09375, + "eval_loss": 0.05905697122216225, + "eval_num_tokens": 236294732.0, + "eval_reward": 0.6300447043031454, + "eval_reward_std": 0.32271731412038207, + "eval_rewards/progression_diversity/mean": -0.08879435807466507, + "eval_rewards/progression_diversity/std": 0.2320320950821042, + "eval_rewards/symbolic_reward_accuracy/mean": 0.681396484375, + "eval_rewards/symbolic_reward_accuracy/std": 0.45585051737725735, + "eval_rewards/symbolic_reward_partial_score/mean": 0.7582194041460752, + "eval_rewards/symbolic_reward_partial_score/std": 0.38103948533535004, + "eval_rewards/tag_count_reward/mean": -0.0537109375, + "eval_rewards/tag_count_reward/std": 0.22261275839991868, + "eval_runtime": 3967.3544, + "eval_samples_per_second": 0.063, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0347554087638855, + "eval_sampling/importance_sampling_ratio/min": 1.7375132229474076e-18, + "eval_sampling/sampling_logp_difference/max": 45.634475350379944, + "eval_sampling/sampling_logp_difference/mean": 0.12127477861940861, + "eval_steps_per_second": 0.001, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16118.0, + "completions/mean_length": 1692.767578125, + "completions/mean_terminated_length": 1340.1781005859375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.22220094501972198, + "epoch": 2.0026315789473683, + "frac_reward_zero_std": 0.1875, + "grad_norm": 3.1163506507873535, + "learning_rate": 1e-06, + "loss": 0.0849, + "num_tokens": 237554389.0, + "reward": 0.6846272945404053, + "reward_std": 0.28297704458236694, + "rewards/progression_diversity/mean": -0.07242967188358307, + "rewards/progression_diversity/std": 0.20485153794288635, + "rewards/symbolic_reward_accuracy/mean": 0.744140625, + "rewards/symbolic_reward_accuracy/std": 0.43676990270614624, + "rewards/symbolic_reward_partial_score/mean": 0.8125, + "rewards/symbolic_reward_partial_score/std": 0.35843971371650696, + "rewards/tag_count_reward/mean": -0.048828125, + "rewards/tag_count_reward/std": 0.2157193273305893, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0238771438598633, + "sampling/importance_sampling_ratio/min": 3.568013560231561e-26, + "sampling/sampling_logp_difference/max": 58.5952033996582, + "sampling/sampling_logp_difference/mean": 0.12895509600639343, + "step": 761 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.2493431717157364, + "epoch": 2.0052631578947366, + "grad_norm": 0.010103636421263218, + "learning_rate": 1e-06, + "loss": 0.165, + "step": 762 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3100139796733856, + "epoch": 2.0078947368421054, + "grad_norm": 0.04030955210328102, + "learning_rate": 1e-06, + "loss": 0.149, + "step": 763 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.3300608843564987, + "epoch": 2.0105263157894737, + "grad_norm": 0.010126029141247272, + "learning_rate": 1e-06, + "loss": 0.0765, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14986.0, + "completions/mean_length": 1680.546875, + "completions/mean_terminated_length": 1447.1588134765625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.21983007341623306, + "epoch": 2.013157894736842, + "frac_reward_zero_std": 0.15625, + "grad_norm": 3.134493589401245, + "learning_rate": 1e-06, + "loss": 0.1212, + "num_tokens": 238821645.0, + "reward": 0.6834567785263062, + "reward_std": 0.27827733755111694, + "rewards/progression_diversity/mean": -0.07717426866292953, + "rewards/progression_diversity/std": 0.20578458905220032, + "rewards/symbolic_reward_accuracy/mean": 0.7421875, + "rewards/symbolic_reward_accuracy/std": 0.43785804510116577, + "rewards/symbolic_reward_partial_score/mean": 0.81396484375, + "rewards/symbolic_reward_partial_score/std": 0.35770609974861145, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0259695053100586, + "sampling/importance_sampling_ratio/min": 3.113477761470245e-26, + "sampling/sampling_logp_difference/max": 58.73147201538086, + "sampling/sampling_logp_difference/mean": 0.13548921048641205, + "step": 765 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.390625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.3355230987071991, + "epoch": 2.0157894736842104, + "grad_norm": 0.014872807078063488, + "learning_rate": 1e-06, + "loss": 0.0679, + "step": 766 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.5302972793579102, + "epoch": 2.018421052631579, + "grad_norm": 0.02062629908323288, + "learning_rate": 1e-06, + "loss": 0.0955, + "step": 767 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.718237966299057, + "epoch": 2.0210526315789474, + "grad_norm": 0.02212103269994259, + "learning_rate": 1e-06, + "loss": 0.1666, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15111.0, + "completions/mean_length": 1686.498046875, + "completions/mean_terminated_length": 1242.911376953125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.23345400393009186, + "epoch": 2.0236842105263158, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.634035050868988, + "learning_rate": 1e-06, + "loss": 0.074, + "num_tokens": 240076428.0, + "reward": 0.6818051934242249, + "reward_std": 0.29768824577331543, + "rewards/progression_diversity/mean": -0.07144007831811905, + "rewards/progression_diversity/std": 0.19266043603420258, + "rewards/symbolic_reward_accuracy/mean": 0.740234375, + "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, + "rewards/symbolic_reward_partial_score/mean": 0.8134765625, + "rewards/symbolic_reward_partial_score/std": 0.3538402020931244, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0273222923278809, + "sampling/importance_sampling_ratio/min": 1.1702893084804623e-27, + "sampling/sampling_logp_difference/max": 62.01254653930664, + "sampling/sampling_logp_difference/mean": 0.14426520466804504, + "step": 769 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.2265625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2532157674431801, + "epoch": 2.026315789473684, + "grad_norm": 0.07092837989330292, + "learning_rate": 1e-06, + "loss": 0.1038, + "step": 770 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3199596703052521, + "epoch": 2.028947368421053, + "grad_norm": 0.031068623065948486, + "learning_rate": 1e-06, + "loss": 0.0876, + "step": 771 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3354818820953369, + "epoch": 2.031578947368421, + "grad_norm": 0.03109137900173664, + "learning_rate": 1e-06, + "loss": 0.1269, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16351.0, + "completions/mean_length": 2114.2578125, + "completions/mean_terminated_length": 1624.1859130859375, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.2218097448348999, + "epoch": 2.0342105263157895, + "frac_reward_zero_std": 0.0625, + "grad_norm": 11.015893936157227, + "learning_rate": 1e-06, + "loss": 0.1472, + "num_tokens": 241569232.0, + "reward": 0.6609280109405518, + "reward_std": 0.3157079815864563, + "rewards/progression_diversity/mean": -0.09372584521770477, + "rewards/progression_diversity/std": 0.21451076865196228, + "rewards/symbolic_reward_accuracy/mean": 0.716796875, + "rewards/symbolic_reward_accuracy/std": 0.4509948492050171, + "rewards/symbolic_reward_partial_score/mean": 0.7921549081802368, + "rewards/symbolic_reward_partial_score/std": 0.37911295890808105, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.011353611946106, + "sampling/importance_sampling_ratio/min": 4.881230540264573e-27, + "sampling/sampling_logp_difference/max": 60.58440017700195, + "sampling/sampling_logp_difference/mean": 0.21283593773841858, + "step": 773 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.484375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6640625, + "entropy": 0.29898831248283386, + "epoch": 2.036842105263158, + "grad_norm": 0.006906085181981325, + "learning_rate": 1e-06, + "loss": 0.1731, + "step": 774 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.5546875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6953125, + "entropy": 0.378355011343956, + "epoch": 2.039473684210526, + "grad_norm": 0.26340046525001526, + "learning_rate": 1e-06, + "loss": 0.1015, + "step": 775 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.546875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.75, + "entropy": 0.5297541320323944, + "epoch": 2.042105263157895, + "grad_norm": 0.008265807293355465, + "learning_rate": 1e-06, + "loss": 0.1864, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13564.0, + "completions/mean_length": 1399.826171875, + "completions/mean_terminated_length": 1101.336669921875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.23068619519472122, + "epoch": 2.044736842105263, + "frac_reward_zero_std": 0.1875, + "grad_norm": 4.750518321990967, + "learning_rate": 1e-06, + "loss": 0.1014, + "num_tokens": 242665047.0, + "reward": 0.7356716394424438, + "reward_std": 0.2675522565841675, + "rewards/progression_diversity/mean": -0.05099809169769287, + "rewards/progression_diversity/std": 0.1577788144350052, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.8575845956802368, + "rewards/symbolic_reward_partial_score/std": 0.3261964023113251, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.020838737487793, + "sampling/importance_sampling_ratio/min": 2.984163448692399e-29, + "sampling/sampling_logp_difference/max": 65.68164825439453, + "sampling/sampling_logp_difference/mean": 0.23801082372665405, + "step": 777 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.27487921714782715, + "epoch": 2.0473684210526315, + "grad_norm": 0.010955928824841976, + "learning_rate": 1e-06, + "loss": 0.1502, + "step": 778 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3157334327697754, + "epoch": 2.05, + "grad_norm": 0.04367856681346893, + "learning_rate": 1e-06, + "loss": 0.0794, + "step": 779 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.31336382031440735, + "epoch": 2.0526315789473686, + "grad_norm": 0.021015914157032967, + "learning_rate": 1e-06, + "loss": 0.0561, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15221.0, + "completions/mean_length": 1509.474609375, + "completions/mean_terminated_length": 1152.486083984375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.22252874076366425, + "epoch": 2.055263157894737, + "frac_reward_zero_std": 0.09375, + "grad_norm": 1.6408276557922363, + "learning_rate": 1e-06, + "loss": 0.1134, + "num_tokens": 243863402.0, + "reward": 0.7193964123725891, + "reward_std": 0.28412267565727234, + "rewards/progression_diversity/mean": -0.037899717688560486, + "rewards/progression_diversity/std": 0.12527695298194885, + "rewards/symbolic_reward_accuracy/mean": 0.78515625, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.8458659052848816, + "rewards/symbolic_reward_partial_score/std": 0.32760530710220337, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0266220569610596, + "sampling/importance_sampling_ratio/min": 4.3695186577606616e-27, + "sampling/sampling_logp_difference/max": 60.69514465332031, + "sampling/sampling_logp_difference/mean": 0.12954550981521606, + "step": 781 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.22228249162435532, + "epoch": 2.057894736842105, + "grad_norm": 0.19859381020069122, + "learning_rate": 1e-06, + "loss": 0.131, + "step": 782 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.20911472290754318, + "epoch": 2.0605263157894735, + "grad_norm": 0.02292071282863617, + "learning_rate": 1e-06, + "loss": 0.1517, + "step": 783 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2361936792731285, + "epoch": 2.0631578947368423, + "grad_norm": 0.015427610836923122, + "learning_rate": 1e-06, + "loss": 0.0483, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15893.0, + "completions/mean_length": 1741.46875, + "completions/mean_terminated_length": 1238.593994140625, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.21045714616775513, + "epoch": 2.0657894736842106, + "frac_reward_zero_std": 0.125, + "grad_norm": 4.345335006713867, + "learning_rate": 1e-06, + "loss": 0.1222, + "num_tokens": 245182202.0, + "reward": 0.7289832830429077, + "reward_std": 0.29325735569000244, + "rewards/progression_diversity/mean": -0.046007223427295685, + "rewards/progression_diversity/std": 0.139408677816391, + "rewards/symbolic_reward_accuracy/mean": 0.802734375, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.8435872793197632, + "rewards/symbolic_reward_partial_score/std": 0.34268614649772644, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0198302268981934, + "sampling/importance_sampling_ratio/min": 9.978149817821409e-34, + "sampling/sampling_logp_difference/max": 75.98749542236328, + "sampling/sampling_logp_difference/mean": 0.21147558093070984, + "step": 785 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2157856523990631, + "epoch": 2.068421052631579, + "grad_norm": 0.01274153497070074, + "learning_rate": 1e-06, + "loss": 0.2087, + "step": 786 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2510286718606949, + "epoch": 2.0710526315789473, + "grad_norm": 0.06488237529993057, + "learning_rate": 1e-06, + "loss": 0.1405, + "step": 787 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2752929925918579, + "epoch": 2.0736842105263156, + "grad_norm": 0.018524806946516037, + "learning_rate": 1e-06, + "loss": 0.1161, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14714.0, + "completions/mean_length": 1852.96875, + "completions/mean_terminated_length": 1262.2763671875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.22035640478134155, + "epoch": 2.0763157894736843, + "frac_reward_zero_std": 0.1875, + "grad_norm": 3.2875747680664062, + "learning_rate": 1e-06, + "loss": 0.1068, + "num_tokens": 246517706.0, + "reward": 0.6830227971076965, + "reward_std": 0.23248738050460815, + "rewards/progression_diversity/mean": -0.04245023429393768, + "rewards/progression_diversity/std": 0.12012242525815964, + "rewards/symbolic_reward_accuracy/mean": 0.740234375, + "rewards/symbolic_reward_accuracy/std": 0.4389347732067108, + "rewards/symbolic_reward_partial_score/mean": 0.8165690302848816, + "rewards/symbolic_reward_partial_score/std": 0.35112887620925903, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.023137092590332, + "sampling/importance_sampling_ratio/min": 1.018668067894736e-33, + "sampling/sampling_logp_difference/max": 75.96681213378906, + "sampling/sampling_logp_difference/mean": 0.3000330924987793, + "step": 789 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.453125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.23726185411214828, + "epoch": 2.0789473684210527, + "grad_norm": 0.875003457069397, + "learning_rate": 1e-06, + "loss": 0.1011, + "step": 790 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5859375, + "entropy": 0.3143727779388428, + "epoch": 2.081578947368421, + "grad_norm": 0.024129139259457588, + "learning_rate": 1e-06, + "loss": 0.1284, + "step": 791 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4921875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6015625, + "entropy": 0.2929740250110626, + "epoch": 2.0842105263157893, + "grad_norm": 0.013345804065465927, + "learning_rate": 1e-06, + "loss": 0.0514, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15700.0, + "completions/mean_length": 1659.0625, + "completions/mean_terminated_length": 903.1622314453125, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.20717183500528336, + "epoch": 2.086842105263158, + "frac_reward_zero_std": 0.21875, + "grad_norm": 6.014965057373047, + "learning_rate": 1e-06, + "loss": 0.1198, + "num_tokens": 247780298.0, + "reward": 0.7235679626464844, + "reward_std": 0.24241438508033752, + "rewards/progression_diversity/mean": -0.030903812497854233, + "rewards/progression_diversity/std": 0.11096394807100296, + "rewards/symbolic_reward_accuracy/mean": 0.7890625, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.85693359375, + "rewards/symbolic_reward_partial_score/std": 0.3159598112106323, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0264015197753906, + "sampling/importance_sampling_ratio/min": 1.2161178811991868e-37, + "sampling/sampling_logp_difference/max": 84.99998474121094, + "sampling/sampling_logp_difference/mean": 0.20792850852012634, + "step": 793 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.20646440982818604, + "epoch": 2.0894736842105264, + "grad_norm": 0.08151978254318237, + "learning_rate": 1e-06, + "loss": 0.1191, + "step": 794 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.22666236013174057, + "epoch": 2.0921052631578947, + "grad_norm": 0.016352776437997818, + "learning_rate": 1e-06, + "loss": 0.1051, + "step": 795 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.23344239592552185, + "epoch": 2.094736842105263, + "grad_norm": 0.0139634869992733, + "learning_rate": 1e-06, + "loss": 0.1076, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16007.0, + "completions/mean_length": 1441.34375, + "completions/mean_terminated_length": 959.3225708007812, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.23309873044490814, + "epoch": 2.0973684210526318, + "frac_reward_zero_std": 0.3125, + "grad_norm": 4.238341808319092, + "learning_rate": 1e-06, + "loss": 0.028, + "num_tokens": 248903546.0, + "reward": 0.7395845651626587, + "reward_std": 0.195578932762146, + "rewards/progression_diversity/mean": -0.030801229178905487, + "rewards/progression_diversity/std": 0.1092456728219986, + "rewards/symbolic_reward_accuracy/mean": 0.802734375, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.8758137822151184, + "rewards/symbolic_reward_partial_score/std": 0.29930591583251953, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0348535776138306, + "sampling/importance_sampling_ratio/min": 7.160635152699815e-43, + "sampling/sampling_logp_difference/max": 97.04222106933594, + "sampling/sampling_logp_difference/mean": 0.22585123777389526, + "step": 797 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.22810588777065277, + "epoch": 2.1, + "grad_norm": 0.013128053396940231, + "learning_rate": 1e-06, + "loss": 0.0723, + "step": 798 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.24390096962451935, + "epoch": 2.1026315789473684, + "grad_norm": 0.007489512674510479, + "learning_rate": 1e-06, + "loss": 0.1571, + "step": 799 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.22683186829090118, + "epoch": 2.1052631578947367, + "grad_norm": 0.012858620844781399, + "learning_rate": 1e-06, + "loss": 0.128, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13867.0, + "completions/mean_length": 1304.953125, + "completions/mean_terminated_length": 849.85107421875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.2136317864060402, + "epoch": 2.1078947368421055, + "frac_reward_zero_std": 0.1875, + "grad_norm": 9.192523002624512, + "learning_rate": 1e-06, + "loss": 0.1348, + "num_tokens": 249969474.0, + "reward": 0.7585318684577942, + "reward_std": 0.23733538389205933, + "rewards/progression_diversity/mean": -0.030604764819145203, + "rewards/progression_diversity/std": 0.1174231469631195, + "rewards/symbolic_reward_accuracy/mean": 0.82421875, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.8914388418197632, + "rewards/symbolic_reward_partial_score/std": 0.2718009948730469, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0298779010772705, + "sampling/importance_sampling_ratio/min": 2.0962023727834939e-41, + "sampling/sampling_logp_difference/max": 93.6658706665039, + "sampling/sampling_logp_difference/mean": 0.1556258201599121, + "step": 801 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.20574460923671722, + "epoch": 2.110526315789474, + "grad_norm": 0.00943561177700758, + "learning_rate": 1e-06, + "loss": 0.1348, + "step": 802 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.22100205719470978, + "epoch": 2.113157894736842, + "grad_norm": 0.018543722108006477, + "learning_rate": 1e-06, + "loss": 0.0386, + "step": 803 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23784782737493515, + "epoch": 2.1157894736842104, + "grad_norm": 0.1003841683268547, + "learning_rate": 1e-06, + "loss": 0.0778, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13121.0, + "completions/mean_length": 1769.97265625, + "completions/mean_terminated_length": 860.3859252929688, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.22240455448627472, + "epoch": 2.1184210526315788, + "frac_reward_zero_std": 0.125, + "grad_norm": 10.562141418457031, + "learning_rate": 1e-06, + "loss": 0.1285, + "num_tokens": 251296532.0, + "reward": 0.736179769039154, + "reward_std": 0.24343962967395782, + "rewards/progression_diversity/mean": -0.03436863049864769, + "rewards/progression_diversity/std": 0.12254931777715683, + "rewards/symbolic_reward_accuracy/mean": 0.8046875, + "rewards/symbolic_reward_accuracy/std": 0.3968288004398346, + "rewards/symbolic_reward_partial_score/mean": 0.8671875, + "rewards/symbolic_reward_partial_score/std": 0.31175029277801514, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0263206958770752, + "sampling/importance_sampling_ratio/min": 1.7586295727276454e-42, + "sampling/sampling_logp_difference/max": 96.14400482177734, + "sampling/sampling_logp_difference/mean": 0.2061021327972412, + "step": 805 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.215851292014122, + "epoch": 2.1210526315789475, + "grad_norm": 0.012161717750132084, + "learning_rate": 1e-06, + "loss": 0.1275, + "step": 806 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.20706567913293839, + "epoch": 2.123684210526316, + "grad_norm": 0.007896292954683304, + "learning_rate": 1e-06, + "loss": 0.1679, + "step": 807 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.390625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.21126502752304077, + "epoch": 2.126315789473684, + "grad_norm": 0.007001427933573723, + "learning_rate": 1e-06, + "loss": 0.1365, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11642.0, + "completions/mean_length": 1819.16796875, + "completions/mean_terminated_length": 848.17919921875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.20564715564250946, + "epoch": 2.1289473684210525, + "frac_reward_zero_std": 0.25, + "grad_norm": 6.921549320220947, + "learning_rate": 1e-06, + "loss": 0.1125, + "num_tokens": 252653450.0, + "reward": 0.7197491526603699, + "reward_std": 0.20420284569263458, + "rewards/progression_diversity/mean": -0.04169023782014847, + "rewards/progression_diversity/std": 0.13636480271816254, + "rewards/symbolic_reward_accuracy/mean": 0.779296875, + "rewards/symbolic_reward_accuracy/std": 0.4151262938976288, + "rewards/symbolic_reward_partial_score/mean": 0.8673502206802368, + "rewards/symbolic_reward_partial_score/std": 0.3025065064430237, + "rewards/tag_count_reward/mean": -0.076171875, + "rewards/tag_count_reward/std": 0.26553234457969666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0277680158615112, + "sampling/importance_sampling_ratio/min": 5.605193857299268e-45, + "sampling/sampling_logp_difference/max": 102.0, + "sampling/sampling_logp_difference/mean": 0.18684858083724976, + "step": 809 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.20571620762348175, + "epoch": 2.1315789473684212, + "grad_norm": 0.033720117062330246, + "learning_rate": 1e-06, + "loss": 0.1593, + "step": 810 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.22204747796058655, + "epoch": 2.1342105263157896, + "grad_norm": 0.20377802848815918, + "learning_rate": 1e-06, + "loss": 0.1086, + "step": 811 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.20914125442504883, + "epoch": 2.136842105263158, + "grad_norm": 0.025145720690488815, + "learning_rate": 1e-06, + "loss": 0.1232, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15292.0, + "completions/mean_length": 1469.455078125, + "completions/mean_terminated_length": 988.3406982421875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.21566122770309448, + "epoch": 2.139473684210526, + "frac_reward_zero_std": 0.21875, + "grad_norm": 1.5204887390136719, + "learning_rate": 1e-06, + "loss": 0.0784, + "num_tokens": 253832819.0, + "reward": 0.7044533491134644, + "reward_std": 0.23354381322860718, + "rewards/progression_diversity/mean": -0.033183254301548004, + "rewards/progression_diversity/std": 0.1218782439827919, + "rewards/symbolic_reward_accuracy/mean": 0.75, + "rewards/symbolic_reward_accuracy/std": 0.43343618512153625, + "rewards/symbolic_reward_partial_score/mean": 0.8701171875, + "rewards/symbolic_reward_partial_score/std": 0.28412559628486633, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0307013988494873, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 105.5, + "sampling/sampling_logp_difference/mean": 0.2200557291507721, + "step": 813 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2126343920826912, + "epoch": 2.1421052631578945, + "grad_norm": 0.019540801644325256, + "learning_rate": 1e-06, + "loss": 0.1643, + "step": 814 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2253113090991974, + "epoch": 2.1447368421052633, + "grad_norm": 0.034518882632255554, + "learning_rate": 1e-06, + "loss": 0.0848, + "step": 815 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.21958737820386887, + "epoch": 2.1473684210526316, + "grad_norm": 0.022237155586481094, + "learning_rate": 1e-06, + "loss": 0.0616, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16089.0, + "completions/mean_length": 1241.01171875, + "completions/mean_terminated_length": 1091.672607421875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.2650395929813385, + "epoch": 2.15, + "frac_reward_zero_std": 0.25, + "grad_norm": 8.908076286315918, + "learning_rate": 1e-06, + "loss": 0.0686, + "num_tokens": 254887545.0, + "reward": 0.7385146617889404, + "reward_std": 0.21202999353408813, + "rewards/progression_diversity/mean": -0.04013665392994881, + "rewards/progression_diversity/std": 0.136456698179245, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.8680013418197632, + "rewards/symbolic_reward_partial_score/std": 0.3045817017555237, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0395100116729736, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 112.35920715332031, + "sampling/sampling_logp_difference/mean": 0.4265938699245453, + "step": 817 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2493005096912384, + "epoch": 2.1526315789473682, + "grad_norm": 0.020062845200300217, + "learning_rate": 1e-06, + "loss": 0.1432, + "step": 818 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.23568686097860336, + "epoch": 2.155263157894737, + "grad_norm": 0.028896203264594078, + "learning_rate": 1e-06, + "loss": 0.1101, + "step": 819 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.24035780876874924, + "epoch": 2.1578947368421053, + "grad_norm": 0.01008315198123455, + "learning_rate": 1e-06, + "loss": 0.0286, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12528.0, + "completions/mean_length": 1109.306640625, + "completions/mean_terminated_length": 958.6686401367188, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.3168548345565796, + "epoch": 2.1605263157894736, + "frac_reward_zero_std": 0.28125, + "grad_norm": 7.0420708656311035, + "learning_rate": 1e-06, + "loss": 0.079, + "num_tokens": 255865494.0, + "reward": 0.7541958093643188, + "reward_std": 0.2114800214767456, + "rewards/progression_diversity/mean": -0.03452695906162262, + "rewards/progression_diversity/std": 0.1269940882921219, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.88818359375, + "rewards/symbolic_reward_partial_score/std": 0.2839567959308624, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0436235666275024, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 119.89215850830078, + "sampling/sampling_logp_difference/mean": 0.4233931005001068, + "step": 821 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2538767158985138, + "epoch": 2.163157894736842, + "grad_norm": 0.014526182785630226, + "learning_rate": 1e-06, + "loss": 0.0951, + "step": 822 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2911252975463867, + "epoch": 2.1657894736842107, + "grad_norm": 0.010871890932321548, + "learning_rate": 1e-06, + "loss": 0.0845, + "step": 823 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2438528761267662, + "epoch": 2.168421052631579, + "grad_norm": 0.008227149024605751, + "learning_rate": 1e-06, + "loss": 0.1009, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13903.0, + "completions/mean_length": 1524.4453125, + "completions/mean_terminated_length": 1198.1876220703125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.32260115444660187, + "epoch": 2.1710526315789473, + "frac_reward_zero_std": 0.28125, + "grad_norm": 18.730411529541016, + "learning_rate": 1e-06, + "loss": 0.0989, + "num_tokens": 257055834.0, + "reward": 0.7190285921096802, + "reward_std": 0.2212091088294983, + "rewards/progression_diversity/mean": -0.04050877317786217, + "rewards/progression_diversity/std": 0.1305409073829651, + "rewards/symbolic_reward_accuracy/mean": 0.78515625, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.8544921875, + "rewards/symbolic_reward_partial_score/std": 0.31850993633270264, + "rewards/tag_count_reward/mean": -0.080078125, + "rewards/tag_count_reward/std": 0.271679550409317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0542526245117188, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 130.9848175048828, + "sampling/sampling_logp_difference/mean": 0.42119100689888, + "step": 825 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.28623804450035095, + "epoch": 2.1736842105263157, + "grad_norm": 0.009265235625207424, + "learning_rate": 1e-06, + "loss": 0.0857, + "step": 826 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.3048030138015747, + "epoch": 2.1763157894736844, + "grad_norm": 0.5141742825508118, + "learning_rate": 1e-06, + "loss": 0.0416, + "step": 827 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.33180132508277893, + "epoch": 2.1789473684210527, + "grad_norm": 0.01808958500623703, + "learning_rate": 1e-06, + "loss": 0.1253, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14963.0, + "completions/mean_length": 1386.966796875, + "completions/mean_terminated_length": 996.2625122070312, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.3035607635974884, + "epoch": 2.181578947368421, + "frac_reward_zero_std": 0.28125, + "grad_norm": 20.19339370727539, + "learning_rate": 1e-06, + "loss": 0.1055, + "num_tokens": 258146409.0, + "reward": 0.7440898418426514, + "reward_std": 0.20957735180854797, + "rewards/progression_diversity/mean": -0.029496124014258385, + "rewards/progression_diversity/std": 0.10593532025814056, + "rewards/symbolic_reward_accuracy/mean": 0.814453125, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.8712565302848816, + "rewards/symbolic_reward_partial_score/std": 0.30432677268981934, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059896469116211, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 135.2486572265625, + "sampling/sampling_logp_difference/mean": 0.5292673707008362, + "step": 829 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.31164059042930603, + "epoch": 2.1842105263157894, + "grad_norm": 4.756748199462891, + "learning_rate": 1e-06, + "loss": 0.0966, + "step": 830 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.32979193329811096, + "epoch": 2.1868421052631577, + "grad_norm": 0.020023655146360397, + "learning_rate": 1e-06, + "loss": 0.1029, + "step": 831 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.30019381642341614, + "epoch": 2.1894736842105265, + "grad_norm": 0.01554365735501051, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14676.0, + "completions/mean_length": 1498.107421875, + "completions/mean_terminated_length": 955.7064819335938, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.4383620619773865, + "epoch": 2.192105263157895, + "frac_reward_zero_std": 0.21875, + "grad_norm": 26.805294036865234, + "learning_rate": 1e-06, + "loss": 0.1879, + "num_tokens": 259308064.0, + "reward": 0.708129346370697, + "reward_std": 0.2203439176082611, + "rewards/progression_diversity/mean": -0.03179560601711273, + "rewards/progression_diversity/std": 0.11374954134225845, + "rewards/symbolic_reward_accuracy/mean": 0.765625, + "rewards/symbolic_reward_accuracy/std": 0.42402184009552, + "rewards/symbolic_reward_partial_score/mean": 0.8465169072151184, + "rewards/symbolic_reward_partial_score/std": 0.31652384996414185, + "rewards/tag_count_reward/mean": -0.048828125, + "rewards/tag_count_reward/std": 0.2157193273305893, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.056193470954895, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 135.6931915283203, + "sampling/sampling_logp_difference/mean": 0.8543011546134949, + "step": 833 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3004182279109955, + "epoch": 2.194736842105263, + "grad_norm": 0.017624402418732643, + "learning_rate": 1e-06, + "loss": 0.0347, + "step": 834 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3438268452882767, + "epoch": 2.1973684210526314, + "grad_norm": 0.015222882851958275, + "learning_rate": 1e-06, + "loss": 0.1393, + "step": 835 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.32134924829006195, + "epoch": 2.2, + "grad_norm": 0.017897766083478928, + "learning_rate": 1e-06, + "loss": 0.0976, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13689.0, + "completions/mean_length": 1387.171875, + "completions/mean_terminated_length": 903.4031982421875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.41330593824386597, + "epoch": 2.2026315789473685, + "frac_reward_zero_std": 0.375, + "grad_norm": 17.12291717529297, + "learning_rate": 1e-06, + "loss": 0.1254, + "num_tokens": 260406584.0, + "reward": 0.7749391794204712, + "reward_std": 0.17058953642845154, + "rewards/progression_diversity/mean": -0.020733974874019623, + "rewards/progression_diversity/std": 0.08155296742916107, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.8982747793197632, + "rewards/symbolic_reward_partial_score/std": 0.27264928817749023, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0631152391433716, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 136.99832153320312, + "sampling/sampling_logp_difference/mean": 0.695594310760498, + "step": 837 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.40689145028591156, + "epoch": 2.205263157894737, + "grad_norm": 0.004986094310879707, + "learning_rate": 1e-06, + "loss": 0.1718, + "step": 838 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3304235339164734, + "epoch": 2.207894736842105, + "grad_norm": 0.013745912350714207, + "learning_rate": 1e-06, + "loss": 0.0868, + "step": 839 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.32582440972328186, + "epoch": 2.2105263157894735, + "grad_norm": 0.00617125304415822, + "learning_rate": 1e-06, + "loss": 0.0352, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13147.0, + "completions/mean_length": 1191.048828125, + "completions/mean_terminated_length": 605.5192260742188, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.3709278106689453, + "epoch": 2.213157894736842, + "frac_reward_zero_std": 0.25, + "grad_norm": 21.482324600219727, + "learning_rate": 1e-06, + "loss": 0.1633, + "num_tokens": 261411185.0, + "reward": 0.7811179161071777, + "reward_std": 0.21279799938201904, + "rewards/progression_diversity/mean": -0.018096236512064934, + "rewards/progression_diversity/std": 0.08493653684854507, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.9129231572151184, + "rewards/symbolic_reward_partial_score/std": 0.247787743806839, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0535836219787598, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 139.99432373046875, + "sampling/sampling_logp_difference/mean": 1.1103196144104004, + "step": 841 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3213431090116501, + "epoch": 2.2157894736842105, + "grad_norm": 0.012497864663600922, + "learning_rate": 1e-06, + "loss": 0.0665, + "step": 842 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.3591870814561844, + "epoch": 2.218421052631579, + "grad_norm": 0.010890123434364796, + "learning_rate": 1e-06, + "loss": 0.1304, + "step": 843 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2970151901245117, + "epoch": 2.221052631578947, + "grad_norm": 0.11356399953365326, + "learning_rate": 1e-06, + "loss": 0.0636, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14047.0, + "completions/mean_length": 1340.443359375, + "completions/mean_terminated_length": 568.1868896484375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.33584554493427277, + "epoch": 2.223684210526316, + "frac_reward_zero_std": 0.375, + "grad_norm": 19.238874435424805, + "learning_rate": 1e-06, + "loss": 0.1027, + "num_tokens": 262479924.0, + "reward": 0.7776129245758057, + "reward_std": 0.1951584815979004, + "rewards/progression_diversity/mean": -0.01703125424683094, + "rewards/progression_diversity/std": 0.0720345601439476, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.90380859375, + "rewards/symbolic_reward_partial_score/std": 0.2672818601131439, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0570869445800781, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 143.1820831298828, + "sampling/sampling_logp_difference/mean": 0.9214514493942261, + "step": 845 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3423171490430832, + "epoch": 2.2263157894736842, + "grad_norm": 0.010278112255036831, + "learning_rate": 1e-06, + "loss": 0.1566, + "step": 846 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3419143259525299, + "epoch": 2.2289473684210526, + "grad_norm": 0.011319981887936592, + "learning_rate": 1e-06, + "loss": 0.0796, + "step": 847 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3335099071264267, + "epoch": 2.231578947368421, + "grad_norm": 0.015382968820631504, + "learning_rate": 1e-06, + "loss": 0.0699, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14075.0, + "completions/mean_length": 1389.193359375, + "completions/mean_terminated_length": 811.3001708984375, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.35164597630500793, + "epoch": 2.2342105263157896, + "frac_reward_zero_std": 0.3125, + "grad_norm": 7.93258810043335, + "learning_rate": 1e-06, + "loss": 0.0673, + "num_tokens": 263594679.0, + "reward": 0.7638546228408813, + "reward_std": 0.19747570157051086, + "rewards/progression_diversity/mean": -0.025676485151052475, + "rewards/progression_diversity/std": 0.09972041100263596, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.8953450322151184, + "rewards/symbolic_reward_partial_score/std": 0.2742253541946411, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.053678035736084, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 163.0, + "sampling/sampling_logp_difference/mean": 1.083064317703247, + "step": 849 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3196953535079956, + "epoch": 2.236842105263158, + "grad_norm": 0.17636702954769135, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 850 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.37696947157382965, + "epoch": 2.2394736842105263, + "grad_norm": 0.02072129212319851, + "learning_rate": 1e-06, + "loss": 0.1117, + "step": 851 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3250402361154556, + "epoch": 2.2421052631578946, + "grad_norm": 0.014444287866353989, + "learning_rate": 1e-06, + "loss": 0.1048, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13668.0, + "completions/mean_length": 1304.669921875, + "completions/mean_terminated_length": 627.6387329101562, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.3698349595069885, + "epoch": 2.2447368421052634, + "frac_reward_zero_std": 0.375, + "grad_norm": 217.0945587158203, + "learning_rate": 1e-06, + "loss": 0.1591, + "num_tokens": 264646126.0, + "reward": 0.7918961048126221, + "reward_std": 0.1911715716123581, + "rewards/progression_diversity/mean": -0.019374651834368706, + "rewards/progression_diversity/std": 0.08247331529855728, + "rewards/symbolic_reward_accuracy/mean": 0.873046875, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.9098306894302368, + "rewards/symbolic_reward_partial_score/std": 0.2634342610836029, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0525051355361938, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 169.99769592285156, + "sampling/sampling_logp_difference/mean": 1.494558334350586, + "step": 853 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30978524684906006, + "epoch": 2.2473684210526317, + "grad_norm": 0.008726145140826702, + "learning_rate": 1e-06, + "loss": 0.0635, + "step": 854 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.32608427107334137, + "epoch": 2.25, + "grad_norm": 0.004246350843459368, + "learning_rate": 1e-06, + "loss": 0.103, + "step": 855 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.308328241109848, + "epoch": 2.2526315789473683, + "grad_norm": 0.005722150672227144, + "learning_rate": 1e-06, + "loss": 0.0967, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13585.0, + "completions/mean_length": 1705.638671875, + "completions/mean_terminated_length": 694.3945922851562, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.29534509778022766, + "epoch": 2.2552631578947366, + "frac_reward_zero_std": 0.15625, + "grad_norm": 196.59950256347656, + "learning_rate": 1e-06, + "loss": 0.1282, + "num_tokens": 265932725.0, + "reward": 0.7359683513641357, + "reward_std": 0.2585596442222595, + "rewards/progression_diversity/mean": -0.031095456331968307, + "rewards/progression_diversity/std": 0.1090308129787445, + "rewards/symbolic_reward_accuracy/mean": 0.80078125, + "rewards/symbolic_reward_accuracy/std": 0.39980348944664, + "rewards/symbolic_reward_partial_score/mean": 0.87548828125, + "rewards/symbolic_reward_partial_score/std": 0.2973478138446808, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0500712394714355, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 166.0, + "sampling/sampling_logp_difference/mean": 1.9202924966812134, + "step": 857 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.40625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.377376526594162, + "epoch": 2.2578947368421054, + "grad_norm": 0.00950684119015932, + "learning_rate": 1e-06, + "loss": 0.1836, + "step": 858 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.4171202927827835, + "epoch": 2.2605263157894737, + "grad_norm": 41.98322296142578, + "learning_rate": 1e-06, + "loss": 0.2213, + "step": 859 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.3512898087501526, + "epoch": 2.263157894736842, + "grad_norm": 0.01232555229216814, + "learning_rate": 1e-06, + "loss": 0.1466, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11700.0, + "completions/mean_length": 1474.6875, + "completions/mean_terminated_length": 612.165283203125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.3273746818304062, + "epoch": 2.2657894736842104, + "frac_reward_zero_std": 0.1875, + "grad_norm": 47.61751937866211, + "learning_rate": 1e-06, + "loss": 0.1721, + "num_tokens": 267097749.0, + "reward": 0.7948732972145081, + "reward_std": 0.23872321844100952, + "rewards/progression_diversity/mean": -0.02927570417523384, + "rewards/progression_diversity/std": 0.11151636391878128, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9148763418197632, + "rewards/symbolic_reward_partial_score/std": 0.253930926322937, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0462769269943237, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 157.0, + "sampling/sampling_logp_difference/mean": 1.056786060333252, + "step": 861 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.28751927614212036, + "epoch": 2.268421052631579, + "grad_norm": 0.012664435431361198, + "learning_rate": 1e-06, + "loss": 0.0962, + "step": 862 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3128338009119034, + "epoch": 2.2710526315789474, + "grad_norm": 0.010503551922738552, + "learning_rate": 1e-06, + "loss": 0.1709, + "step": 863 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.299874484539032, + "epoch": 2.2736842105263158, + "grad_norm": 0.009202693589031696, + "learning_rate": 1e-06, + "loss": 0.1373, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12330.0, + "completions/mean_length": 1326.27734375, + "completions/mean_terminated_length": 618.0408935546875, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.29694661498069763, + "epoch": 2.276315789473684, + "frac_reward_zero_std": 0.25, + "grad_norm": 28.95979881286621, + "learning_rate": 1e-06, + "loss": 0.1895, + "num_tokens": 268138083.0, + "reward": 0.7938084602355957, + "reward_std": 0.1921263337135315, + "rewards/progression_diversity/mean": -0.02833309769630432, + "rewards/progression_diversity/std": 0.11551596224308014, + "rewards/symbolic_reward_accuracy/mean": 0.869140625, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.92041015625, + "rewards/symbolic_reward_partial_score/std": 0.24347224831581116, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0430631637573242, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 157.90789794921875, + "sampling/sampling_logp_difference/mean": 0.31643933057785034, + "step": 865 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2519264966249466, + "epoch": 2.2789473684210524, + "grad_norm": 0.0065588075667619705, + "learning_rate": 1e-06, + "loss": 0.0924, + "step": 866 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.260475218296051, + "epoch": 2.281578947368421, + "grad_norm": 0.291818231344223, + "learning_rate": 1e-06, + "loss": 0.0611, + "step": 867 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2639893591403961, + "epoch": 2.2842105263157895, + "grad_norm": 0.010040219873189926, + "learning_rate": 1e-06, + "loss": 0.0681, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10708.0, + "completions/mean_length": 1395.240234375, + "completions/mean_terminated_length": 560.814453125, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.2564142793416977, + "epoch": 2.286842105263158, + "frac_reward_zero_std": 0.34375, + "grad_norm": 28.421417236328125, + "learning_rate": 1e-06, + "loss": 0.2116, + "num_tokens": 269248734.0, + "reward": 0.7830079793930054, + "reward_std": 0.18753179907798767, + "rewards/progression_diversity/mean": -0.03417276591062546, + "rewards/progression_diversity/std": 0.13364149630069733, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.9034830331802368, + "rewards/symbolic_reward_partial_score/std": 0.2679261863231659, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.034959316253662, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 158.98016357421875, + "sampling/sampling_logp_difference/mean": 0.3048059940338135, + "step": 869 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2360822707414627, + "epoch": 2.2894736842105265, + "grad_norm": 0.009712324477732182, + "learning_rate": 1e-06, + "loss": 0.0756, + "step": 870 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2438521683216095, + "epoch": 2.292105263157895, + "grad_norm": 0.008457427844405174, + "learning_rate": 1e-06, + "loss": 0.0852, + "step": 871 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.23070208728313446, + "epoch": 2.294736842105263, + "grad_norm": 0.006932724732905626, + "learning_rate": 1e-06, + "loss": 0.1447, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10716.0, + "completions/mean_length": 1633.05859375, + "completions/mean_terminated_length": 583.8284301757812, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.24190223217010498, + "epoch": 2.2973684210526315, + "frac_reward_zero_std": 0.3125, + "grad_norm": 5.179454326629639, + "learning_rate": 1e-06, + "loss": 0.0743, + "num_tokens": 270490172.0, + "reward": 0.7714011669158936, + "reward_std": 0.20021489262580872, + "rewards/progression_diversity/mean": -0.03762088716030121, + "rewards/progression_diversity/std": 0.13623382151126862, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.9046223759651184, + "rewards/symbolic_reward_partial_score/std": 0.26515042781829834, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0347652435302734, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 162.99392700195312, + "sampling/sampling_logp_difference/mean": 0.2914007902145386, + "step": 873 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.24344737827777863, + "epoch": 2.3, + "grad_norm": 3.351227045059204, + "learning_rate": 1e-06, + "loss": 0.1364, + "step": 874 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2388121336698532, + "epoch": 2.3026315789473686, + "grad_norm": 0.011248363181948662, + "learning_rate": 1e-06, + "loss": 0.154, + "step": 875 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2308916077017784, + "epoch": 2.305263157894737, + "grad_norm": 0.24132992327213287, + "learning_rate": 1e-06, + "loss": 0.1312, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10811.0, + "completions/mean_length": 1310.69921875, + "completions/mean_terminated_length": 569.3892822265625, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.24580485373735428, + "epoch": 2.307894736842105, + "frac_reward_zero_std": 0.3125, + "grad_norm": 3.5337963104248047, + "learning_rate": 1e-06, + "loss": 0.0719, + "num_tokens": 271567746.0, + "reward": 0.7978794574737549, + "reward_std": 0.2167576551437378, + "rewards/progression_diversity/mean": -0.026513516902923584, + "rewards/progression_diversity/std": 0.11184799671173096, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.9156901240348816, + "rewards/symbolic_reward_partial_score/std": 0.25561559200286865, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0415388345718384, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 161.8677520751953, + "sampling/sampling_logp_difference/mean": 0.3057638108730316, + "step": 877 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2451568767428398, + "epoch": 2.3105263157894735, + "grad_norm": 0.012724120169878006, + "learning_rate": 1e-06, + "loss": 0.0798, + "step": 878 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.24493427574634552, + "epoch": 2.3131578947368423, + "grad_norm": 0.018414990976452827, + "learning_rate": 1e-06, + "loss": 0.1669, + "step": 879 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.24908612668514252, + "epoch": 2.3157894736842106, + "grad_norm": 0.00875798612833023, + "learning_rate": 1e-06, + "loss": 0.138, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11232.0, + "completions/mean_length": 1375.77734375, + "completions/mean_terminated_length": 572.8682861328125, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.2587193250656128, + "epoch": 2.318421052631579, + "frac_reward_zero_std": 0.28125, + "grad_norm": 12.977412223815918, + "learning_rate": 1e-06, + "loss": 0.1116, + "num_tokens": 272652592.0, + "reward": 0.7711009979248047, + "reward_std": 0.20819194614887238, + "rewards/progression_diversity/mean": -0.02857367694377899, + "rewards/progression_diversity/std": 0.11577697843313217, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.9085286259651184, + "rewards/symbolic_reward_partial_score/std": 0.2529752552509308, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0420448780059814, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 166.0, + "sampling/sampling_logp_difference/mean": 0.29961156845092773, + "step": 881 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.26260629296302795, + "epoch": 2.3210526315789473, + "grad_norm": 0.06035393849015236, + "learning_rate": 1e-06, + "loss": 0.1662, + "step": 882 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2572166621685028, + "epoch": 2.3236842105263156, + "grad_norm": 0.01509555708616972, + "learning_rate": 1e-06, + "loss": 0.0984, + "step": 883 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2562931329011917, + "epoch": 2.3263157894736843, + "grad_norm": 0.009722933173179626, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11360.0, + "completions/mean_length": 1769.9296875, + "completions/mean_terminated_length": 531.4491577148438, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.2543962597846985, + "epoch": 2.3289473684210527, + "frac_reward_zero_std": 0.3125, + "grad_norm": 8.712998390197754, + "learning_rate": 1e-06, + "loss": 0.1231, + "num_tokens": 273958092.0, + "reward": 0.7438181042671204, + "reward_std": 0.2017257809638977, + "rewards/progression_diversity/mean": -0.04201960563659668, + "rewards/progression_diversity/std": 0.1431008279323578, + "rewards/symbolic_reward_accuracy/mean": 0.810546875, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.8818359375, + "rewards/symbolic_reward_partial_score/std": 0.29238346219062805, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041464924812317, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 171.0, + "sampling/sampling_logp_difference/mean": 0.3230324387550354, + "step": 885 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.24783600121736526, + "epoch": 2.331578947368421, + "grad_norm": 0.008411634713411331, + "learning_rate": 1e-06, + "loss": 0.0807, + "step": 886 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.24633554369211197, + "epoch": 2.3342105263157893, + "grad_norm": 0.013761989772319794, + "learning_rate": 1e-06, + "loss": 0.1864, + "step": 887 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.09375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2572323977947235, + "epoch": 2.336842105263158, + "grad_norm": 0.011136609129607677, + "learning_rate": 1e-06, + "loss": 0.1732, + "step": 888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15127.0, + "completions/mean_length": 1794.662109375, + "completions/mean_terminated_length": 756.9267578125, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.274614155292511, + "epoch": 2.3394736842105264, + "frac_reward_zero_std": 0.28125, + "grad_norm": 30.201536178588867, + "learning_rate": 1e-06, + "loss": 0.1424, + "num_tokens": 275266591.0, + "reward": 0.7809537649154663, + "reward_std": 0.18957683444023132, + "rewards/progression_diversity/mean": -0.039390042424201965, + "rewards/progression_diversity/std": 0.1323193460702896, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.9039713740348816, + "rewards/symbolic_reward_partial_score/std": 0.26959308981895447, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.047221302986145, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 186.931884765625, + "sampling/sampling_logp_difference/mean": 0.643196702003479, + "step": 889 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2707149535417557, + "epoch": 2.3421052631578947, + "grad_norm": 0.010098773054778576, + "learning_rate": 1e-06, + "loss": 0.1054, + "step": 890 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.28222164511680603, + "epoch": 2.344736842105263, + "grad_norm": 0.3068772554397583, + "learning_rate": 1e-06, + "loss": 0.1973, + "step": 891 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.266815185546875, + "epoch": 2.3473684210526318, + "grad_norm": 1.143034815788269, + "learning_rate": 1e-06, + "loss": 0.126, + "step": 892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12367.0, + "completions/mean_length": 1806.0625, + "completions/mean_terminated_length": 570.64404296875, + "completions/min_length": 222.0, + "completions/min_terminated_length": 222.0, + "entropy": 0.2788756787776947, + "epoch": 2.35, + "frac_reward_zero_std": 0.25, + "grad_norm": 49.109130859375, + "learning_rate": 1e-06, + "loss": 0.192, + "num_tokens": 276565887.0, + "reward": 0.7949280142784119, + "reward_std": 0.20325177907943726, + "rewards/progression_diversity/mean": -0.04333332180976868, + "rewards/progression_diversity/std": 0.14543607831001282, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.91162109375, + "rewards/symbolic_reward_partial_score/std": 0.2654019296169281, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0455005168914795, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 178.99993896484375, + "sampling/sampling_logp_difference/mean": 0.7122431993484497, + "step": 893 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2602469027042389, + "epoch": 2.3526315789473684, + "grad_norm": 0.005726585630327463, + "learning_rate": 1e-06, + "loss": 0.1203, + "step": 894 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2623884826898575, + "epoch": 2.3552631578947367, + "grad_norm": 0.15292230248451233, + "learning_rate": 1e-06, + "loss": 0.0371, + "step": 895 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2809968888759613, + "epoch": 2.3578947368421055, + "grad_norm": 0.005319634452462196, + "learning_rate": 1e-06, + "loss": 0.2592, + "step": 896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14861.0, + "completions/mean_length": 2606.03515625, + "completions/mean_terminated_length": 777.1017456054688, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.28651466965675354, + "epoch": 2.360526315789474, + "frac_reward_zero_std": 0.09375, + "grad_norm": 24.261709213256836, + "learning_rate": 1e-06, + "loss": 0.1616, + "num_tokens": 278317681.0, + "reward": 0.7092536687850952, + "reward_std": 0.2737279534339905, + "rewards/progression_diversity/mean": -0.05608125776052475, + "rewards/progression_diversity/std": 0.1531989425420761, + "rewards/symbolic_reward_accuracy/mean": 0.78125, + "rewards/symbolic_reward_accuracy/std": 0.41380295157432556, + "rewards/symbolic_reward_partial_score/mean": 0.8426106572151184, + "rewards/symbolic_reward_partial_score/std": 0.3354193866252899, + "rewards/tag_count_reward/mean": -0.1171875, + "rewards/tag_count_reward/std": 0.32195815443992615, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045390248298645, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 180.99990844726562, + "sampling/sampling_logp_difference/mean": 0.8015948534011841, + "step": 897 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.30110684037208557, + "epoch": 2.363157894736842, + "grad_norm": 4.408512115478516, + "learning_rate": 1e-06, + "loss": 0.2523, + "step": 898 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2703721821308136, + "epoch": 2.3657894736842104, + "grad_norm": 0.4832998216152191, + "learning_rate": 1e-06, + "loss": 0.1279, + "step": 899 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4140625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.28015589714050293, + "epoch": 2.3684210526315788, + "grad_norm": 0.007808802183717489, + "learning_rate": 1e-06, + "loss": 0.1913, + "step": 900 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12939.0, + "completions/mean_length": 1535.99609375, + "completions/mean_terminated_length": 805.766357421875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.2482433319091797, + "epoch": 2.3710526315789475, + "frac_reward_zero_std": 0.25, + "grad_norm": 15.909863471984863, + "learning_rate": 1e-06, + "loss": 0.0303, + "num_tokens": 279514095.0, + "reward": 0.775747537612915, + "reward_std": 0.22888442873954773, + "rewards/progression_diversity/mean": -0.032673317939043045, + "rewards/progression_diversity/std": 0.12282172590494156, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.9013671875, + "rewards/symbolic_reward_partial_score/std": 0.26712751388549805, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0461652278900146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 189.7768096923828, + "sampling/sampling_logp_difference/mean": 0.7810826301574707, + "step": 901 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.275927871465683, + "epoch": 2.373684210526316, + "grad_norm": 0.009297163225710392, + "learning_rate": 1e-06, + "loss": 0.1966, + "step": 902 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2581518739461899, + "epoch": 2.376315789473684, + "grad_norm": 0.01285830419510603, + "learning_rate": 1e-06, + "loss": 0.1598, + "step": 903 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.27708950638771057, + "epoch": 2.3789473684210525, + "grad_norm": 0.01762818545103073, + "learning_rate": 1e-06, + "loss": 0.2154, + "step": 904 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12266.0, + "completions/mean_length": 1788.240234375, + "completions/mean_terminated_length": 684.3592529296875, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.2828632742166519, + "epoch": 2.3815789473684212, + "frac_reward_zero_std": 0.1875, + "grad_norm": 28.789213180541992, + "learning_rate": 1e-06, + "loss": 0.1664, + "num_tokens": 280833482.0, + "reward": 0.7678719162940979, + "reward_std": 0.19974274933338165, + "rewards/progression_diversity/mean": -0.034103699028491974, + "rewards/progression_diversity/std": 0.11745806038379669, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.9051106572151184, + "rewards/symbolic_reward_partial_score/std": 0.2643766403198242, + "rewards/tag_count_reward/mean": -0.072265625, + "rewards/tag_count_reward/std": 0.2591804563999176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0446805953979492, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 187.5076446533203, + "sampling/sampling_logp_difference/mean": 0.7177349328994751, + "step": 905 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2618148922920227, + "epoch": 2.3842105263157896, + "grad_norm": 0.5371949672698975, + "learning_rate": 1e-06, + "loss": 0.1672, + "step": 906 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.26834670454263687, + "epoch": 2.386842105263158, + "grad_norm": 0.008967792615294456, + "learning_rate": 1e-06, + "loss": 0.0927, + "step": 907 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.2814731001853943, + "epoch": 2.389473684210526, + "grad_norm": 1.3082025051116943, + "learning_rate": 1e-06, + "loss": 0.1104, + "step": 908 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11597.0, + "completions/mean_length": 2223.818359375, + "completions/mean_terminated_length": 826.0321655273438, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.26080475747585297, + "epoch": 2.3921052631578945, + "frac_reward_zero_std": 0.15625, + "grad_norm": 56.598018646240234, + "learning_rate": 1e-06, + "loss": 0.1662, + "num_tokens": 282405421.0, + "reward": 0.7159743309020996, + "reward_std": 0.26898401975631714, + "rewards/progression_diversity/mean": -0.04807615280151367, + "rewards/progression_diversity/std": 0.13962800800800323, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.8575846552848816, + "rewards/symbolic_reward_partial_score/std": 0.31706979870796204, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0462749004364014, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 186.0, + "sampling/sampling_logp_difference/mean": 0.7766126990318298, + "step": 909 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.27692703902721405, + "epoch": 2.3947368421052633, + "grad_norm": 3.8725080490112305, + "learning_rate": 1e-06, + "loss": 0.1265, + "step": 910 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2723271772265434, + "epoch": 2.3973684210526316, + "grad_norm": 3.388723611831665, + "learning_rate": 1e-06, + "loss": 0.1184, + "step": 911 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2788117825984955, + "epoch": 2.4, + "grad_norm": 0.0120708541944623, + "learning_rate": 1e-06, + "loss": 0.2389, + "step": 912 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11453.0, + "completions/mean_length": 1910.248046875, + "completions/mean_terminated_length": 616.8489379882812, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.2541096359491348, + "epoch": 2.4026315789473682, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.020459089428186417, + "learning_rate": 1e-06, + "loss": 0.0879, + "num_tokens": 283807468.0, + "reward": 0.7370096445083618, + "reward_std": 0.23636674880981445, + "rewards/progression_diversity/mean": -0.03927619010210037, + "rewards/progression_diversity/std": 0.13120803236961365, + "rewards/symbolic_reward_accuracy/mean": 0.802734375, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.8798828125, + "rewards/symbolic_reward_partial_score/std": 0.28891611099243164, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0478408336639404, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 186.0, + "sampling/sampling_logp_difference/mean": 0.6119551658630371, + "step": 913 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.28826163709163666, + "epoch": 2.405263157894737, + "grad_norm": 10.523284912109375, + "learning_rate": 1e-06, + "loss": 0.2367, + "step": 914 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2593996897339821, + "epoch": 2.4078947368421053, + "grad_norm": 0.012641196139156818, + "learning_rate": 1e-06, + "loss": 0.1789, + "step": 915 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2622472196817398, + "epoch": 2.4105263157894736, + "grad_norm": 0.012980838306248188, + "learning_rate": 1e-06, + "loss": 0.0986, + "step": 916 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10840.0, + "completions/mean_length": 1772.17578125, + "completions/mean_terminated_length": 667.0798950195312, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.2561643496155739, + "epoch": 2.413157894736842, + "frac_reward_zero_std": 0.25, + "grad_norm": 34.25957489013672, + "learning_rate": 1e-06, + "loss": 0.0891, + "num_tokens": 285117126.0, + "reward": 0.7429047226905823, + "reward_std": 0.22891083359718323, + "rewards/progression_diversity/mean": -0.040586501359939575, + "rewards/progression_diversity/std": 0.14015312492847443, + "rewards/symbolic_reward_accuracy/mean": 0.810546875, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.8826497793197632, + "rewards/symbolic_reward_partial_score/std": 0.28473344445228577, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0470426082611084, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 185.0, + "sampling/sampling_logp_difference/mean": 0.5546722412109375, + "step": 917 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2671400457620621, + "epoch": 2.4157894736842107, + "grad_norm": 0.01011139526963234, + "learning_rate": 1e-06, + "loss": 0.1876, + "step": 918 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.25694411993026733, + "epoch": 2.418421052631579, + "grad_norm": 0.011605838313698769, + "learning_rate": 1e-06, + "loss": 0.1389, + "step": 919 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2727016508579254, + "epoch": 2.4210526315789473, + "grad_norm": 1.0605076551437378, + "learning_rate": 1e-06, + "loss": 0.167, + "step": 920 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11618.0, + "completions/mean_length": 1737.193359375, + "completions/mean_terminated_length": 728.1231689453125, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.26561206579208374, + "epoch": 2.4236842105263157, + "frac_reward_zero_std": 0.25, + "grad_norm": 114.35115814208984, + "learning_rate": 1e-06, + "loss": 0.1868, + "num_tokens": 286425737.0, + "reward": 0.7702474594116211, + "reward_std": 0.19327302277088165, + "rewards/progression_diversity/mean": -0.03580557554960251, + "rewards/progression_diversity/std": 0.12353134900331497, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.9000650644302368, + "rewards/symbolic_reward_partial_score/std": 0.2719910442829132, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0514814853668213, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 187.99998474121094, + "sampling/sampling_logp_difference/mean": 0.6879295110702515, + "step": 921 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.2657042294740677, + "epoch": 2.4263157894736844, + "grad_norm": 4.603257179260254, + "learning_rate": 1e-06, + "loss": 0.0912, + "step": 922 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2845795154571533, + "epoch": 2.4289473684210527, + "grad_norm": 0.0070864069275557995, + "learning_rate": 1e-06, + "loss": 0.1715, + "step": 923 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.27537743747234344, + "epoch": 2.431578947368421, + "grad_norm": 0.988322377204895, + "learning_rate": 1e-06, + "loss": 0.1607, + "step": 924 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11161.0, + "completions/mean_length": 1345.548828125, + "completions/mean_terminated_length": 605.9528198242188, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.25172994285821915, + "epoch": 2.4342105263157894, + "frac_reward_zero_std": 0.4375, + "grad_norm": 17.042478561401367, + "learning_rate": 1e-06, + "loss": 0.1133, + "num_tokens": 287528290.0, + "reward": 0.8133134841918945, + "reward_std": 0.15900012850761414, + "rewards/progression_diversity/mean": -0.030958127230405807, + "rewards/progression_diversity/std": 0.12952972948551178, + "rewards/symbolic_reward_accuracy/mean": 0.8984375, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.9288737177848816, + "rewards/symbolic_reward_partial_score/std": 0.23390844464302063, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.050032615661621, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 187.0, + "sampling/sampling_logp_difference/mean": 0.4537726044654846, + "step": 925 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.25580619275569916, + "epoch": 2.4368421052631577, + "grad_norm": 0.008091673254966736, + "learning_rate": 1e-06, + "loss": 0.1041, + "step": 926 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2634577751159668, + "epoch": 2.4394736842105265, + "grad_norm": 0.019041819497942924, + "learning_rate": 1e-06, + "loss": 0.1278, + "step": 927 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2495371326804161, + "epoch": 2.442105263157895, + "grad_norm": 0.00901293195784092, + "learning_rate": 1e-06, + "loss": 0.1001, + "step": 928 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10453.0, + "completions/mean_length": 2109.220703125, + "completions/mean_terminated_length": 598.4989013671875, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.26799434423446655, + "epoch": 2.444736842105263, + "frac_reward_zero_std": 0.21875, + "grad_norm": 14.625404357910156, + "learning_rate": 1e-06, + "loss": 0.1196, + "num_tokens": 289010675.0, + "reward": 0.7222642302513123, + "reward_std": 0.21547727286815643, + "rewards/progression_diversity/mean": -0.04896814748644829, + "rewards/progression_diversity/std": 0.15030057728290558, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.8688151240348816, + "rewards/symbolic_reward_partial_score/std": 0.29783180356025696, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0500848293304443, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 184.99998474121094, + "sampling/sampling_logp_difference/mean": 0.47443127632141113, + "step": 929 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.26960813999176025, + "epoch": 2.4473684210526314, + "grad_norm": 1.1039235591888428, + "learning_rate": 1e-06, + "loss": 0.1318, + "step": 930 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.26754334568977356, + "epoch": 2.45, + "grad_norm": 0.5074542164802551, + "learning_rate": 1e-06, + "loss": 0.126, + "step": 931 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2775499224662781, + "epoch": 2.4526315789473685, + "grad_norm": 0.007839949801564217, + "learning_rate": 1e-06, + "loss": 0.1686, + "step": 932 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11585.0, + "completions/mean_length": 2094.462890625, + "completions/mean_terminated_length": 683.90771484375, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.2719879597425461, + "epoch": 2.455263157894737, + "frac_reward_zero_std": 0.125, + "grad_norm": 22.145875930786133, + "learning_rate": 1e-06, + "loss": 0.1494, + "num_tokens": 290511872.0, + "reward": 0.7216647267341614, + "reward_std": 0.2439390867948532, + "rewards/progression_diversity/mean": -0.05032936483621597, + "rewards/progression_diversity/std": 0.15467680990695953, + "rewards/symbolic_reward_accuracy/mean": 0.78515625, + "rewards/symbolic_reward_accuracy/std": 0.4111155867576599, + "rewards/symbolic_reward_partial_score/mean": 0.8629556894302368, + "rewards/symbolic_reward_partial_score/std": 0.29924580454826355, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0494955778121948, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 185.0, + "sampling/sampling_logp_difference/mean": 0.5632617473602295, + "step": 933 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2774350792169571, + "epoch": 2.457894736842105, + "grad_norm": 9.17587947845459, + "learning_rate": 1e-06, + "loss": 0.1804, + "step": 934 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2561372071504593, + "epoch": 2.4605263157894735, + "grad_norm": 0.012559008784592152, + "learning_rate": 1e-06, + "loss": 0.1994, + "step": 935 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.27159862220287323, + "epoch": 2.463157894736842, + "grad_norm": 0.4640989601612091, + "learning_rate": 1e-06, + "loss": 0.1847, + "step": 936 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9495.0, + "completions/mean_length": 1562.392578125, + "completions/mean_terminated_length": 508.1359558105469, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.24451758712530136, + "epoch": 2.4657894736842105, + "frac_reward_zero_std": 0.28125, + "grad_norm": 7.657289505004883, + "learning_rate": 1e-06, + "loss": 0.1219, + "num_tokens": 291718473.0, + "reward": 0.7819766998291016, + "reward_std": 0.22530367970466614, + "rewards/progression_diversity/mean": -0.03475029766559601, + "rewards/progression_diversity/std": 0.13393589854240417, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, + "rewards/symbolic_reward_partial_score/std": 0.2685166001319885, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0499627590179443, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 183.99996948242188, + "sampling/sampling_logp_difference/mean": 0.395052433013916, + "step": 937 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.26470696926116943, + "epoch": 2.468421052631579, + "grad_norm": 14.9205961227417, + "learning_rate": 1e-06, + "loss": 0.1575, + "step": 938 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.25287581980228424, + "epoch": 2.4710526315789476, + "grad_norm": 0.30867093801498413, + "learning_rate": 1e-06, + "loss": 0.0946, + "step": 939 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2676776349544525, + "epoch": 2.473684210526316, + "grad_norm": 0.011630654335021973, + "learning_rate": 1e-06, + "loss": 0.164, + "step": 940 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11695.0, + "completions/mean_length": 1605.857421875, + "completions/mean_terminated_length": 620.64794921875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.2720135450363159, + "epoch": 2.4763157894736842, + "frac_reward_zero_std": 0.25, + "grad_norm": 22.873489379882812, + "learning_rate": 1e-06, + "loss": 0.1462, + "num_tokens": 292933632.0, + "reward": 0.7649558186531067, + "reward_std": 0.22175286710262299, + "rewards/progression_diversity/mean": -0.04250683635473251, + "rewards/progression_diversity/std": 0.15298283100128174, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.90283203125, + "rewards/symbolic_reward_partial_score/std": 0.2671821117401123, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0548969507217407, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 186.0, + "sampling/sampling_logp_difference/mean": 0.43102023005485535, + "step": 941 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2624838799238205, + "epoch": 2.4789473684210526, + "grad_norm": 0.00974233727902174, + "learning_rate": 1e-06, + "loss": 0.0969, + "step": 942 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2723797559738159, + "epoch": 2.481578947368421, + "grad_norm": 0.013580858707427979, + "learning_rate": 1e-06, + "loss": 0.1871, + "step": 943 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.27662527561187744, + "epoch": 2.4842105263157896, + "grad_norm": 0.8762795329093933, + "learning_rate": 1e-06, + "loss": 0.1239, + "step": 944 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12230.0, + "completions/mean_length": 2296.412109375, + "completions/mean_terminated_length": 669.74072265625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.2892232984304428, + "epoch": 2.486842105263158, + "frac_reward_zero_std": 0.125, + "grad_norm": 16.07674217224121, + "learning_rate": 1e-06, + "loss": 0.3197, + "num_tokens": 294500691.0, + "reward": 0.721849799156189, + "reward_std": 0.270133912563324, + "rewards/progression_diversity/mean": -0.061112575232982635, + "rewards/progression_diversity/std": 0.17030061781406403, + "rewards/symbolic_reward_accuracy/mean": 0.79296875, + "rewards/symbolic_reward_accuracy/std": 0.40557438135147095, + "rewards/symbolic_reward_partial_score/mean": 0.8528646230697632, + "rewards/symbolic_reward_partial_score/std": 0.327158123254776, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061535120010376, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 182.99998474121094, + "sampling/sampling_logp_difference/mean": 0.5773719549179077, + "step": 945 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2695477306842804, + "epoch": 2.4894736842105263, + "grad_norm": 0.016710912808775902, + "learning_rate": 1e-06, + "loss": 0.1135, + "step": 946 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.2837250232696533, + "epoch": 2.4921052631578946, + "grad_norm": 2.493502616882324, + "learning_rate": 1e-06, + "loss": 0.1442, + "step": 947 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.28005383908748627, + "epoch": 2.4947368421052634, + "grad_norm": 1.3186397552490234, + "learning_rate": 1e-06, + "loss": 0.2604, + "step": 948 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12417.0, + "completions/mean_length": 2161.0078125, + "completions/mean_terminated_length": 621.7229614257812, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.2822905331850052, + "epoch": 2.4973684210526317, + "frac_reward_zero_std": 0.28125, + "grad_norm": 14.119881629943848, + "learning_rate": 1e-06, + "loss": 0.1365, + "num_tokens": 296000247.0, + "reward": 0.7538953423500061, + "reward_std": 0.20081034302711487, + "rewards/progression_diversity/mean": -0.04991994798183441, + "rewards/progression_diversity/std": 0.15201660990715027, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.8811848759651184, + "rewards/symbolic_reward_partial_score/std": 0.3015502393245697, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0665756464004517, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 187.9999542236328, + "sampling/sampling_logp_difference/mean": 0.5656195282936096, + "step": 949 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.28284990787506104, + "epoch": 2.5, + "grad_norm": 2.7521109580993652, + "learning_rate": 1e-06, + "loss": 0.118, + "step": 950 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.2816344350576401, + "epoch": 2.5026315789473683, + "grad_norm": 2.637901544570923, + "learning_rate": 1e-06, + "loss": 0.1765, + "step": 951 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2915927320718765, + "epoch": 2.5052631578947366, + "grad_norm": 0.009813666343688965, + "learning_rate": 1e-06, + "loss": 0.1735, + "step": 952 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10413.0, + "completions/mean_length": 1986.9765625, + "completions/mean_terminated_length": 565.8111572265625, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.26933328807353973, + "epoch": 2.5078947368421054, + "frac_reward_zero_std": 0.3125, + "grad_norm": 31.982301712036133, + "learning_rate": 1e-06, + "loss": 0.1037, + "num_tokens": 297408875.0, + "reward": 0.7207555770874023, + "reward_std": 0.19293510913848877, + "rewards/progression_diversity/mean": -0.04358455538749695, + "rewards/progression_diversity/std": 0.14303399622440338, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.8675130605697632, + "rewards/symbolic_reward_partial_score/std": 0.29670462012290955, + "rewards/tag_count_reward/mean": -0.08984375, + "rewards/tag_count_reward/std": 0.2862374484539032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059614658355713, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 190.99974060058594, + "sampling/sampling_logp_difference/mean": 0.5123655200004578, + "step": 953 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2840705066919327, + "epoch": 2.5105263157894737, + "grad_norm": 9.22028923034668, + "learning_rate": 1e-06, + "loss": 0.2415, + "step": 954 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2771739959716797, + "epoch": 2.513157894736842, + "grad_norm": 1.641637921333313, + "learning_rate": 1e-06, + "loss": 0.1161, + "step": 955 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.27851031720638275, + "epoch": 2.515789473684211, + "grad_norm": 2.4652910232543945, + "learning_rate": 1e-06, + "loss": 0.1288, + "step": 956 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13128.0, + "completions/mean_length": 1963.44921875, + "completions/mean_terminated_length": 674.8042602539062, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.2786627262830734, + "epoch": 2.518421052631579, + "frac_reward_zero_std": 0.34375, + "grad_norm": 18.040977478027344, + "learning_rate": 1e-06, + "loss": 0.0985, + "num_tokens": 298796273.0, + "reward": 0.7595969438552856, + "reward_std": 0.18789124488830566, + "rewards/progression_diversity/mean": -0.041288819164037704, + "rewards/progression_diversity/std": 0.13274167478084564, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.88623046875, + "rewards/symbolic_reward_partial_score/std": 0.29010042548179626, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.061213493347168, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 190.0, + "sampling/sampling_logp_difference/mean": 0.4693317413330078, + "step": 957 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2739100754261017, + "epoch": 2.5210526315789474, + "grad_norm": 0.009779985062777996, + "learning_rate": 1e-06, + "loss": 0.1521, + "step": 958 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.295116662979126, + "epoch": 2.5236842105263158, + "grad_norm": 0.010075357742607594, + "learning_rate": 1e-06, + "loss": 0.1778, + "step": 959 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.27153341472148895, + "epoch": 2.526315789473684, + "grad_norm": 0.01223987527191639, + "learning_rate": 1e-06, + "loss": 0.1236, + "step": 960 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14244.0, + "completions/mean_length": 2309.912109375, + "completions/mean_terminated_length": 786.742431640625, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.30197782814502716, + "epoch": 2.5289473684210524, + "frac_reward_zero_std": 0.15625, + "grad_norm": 25.759937286376953, + "learning_rate": 1e-06, + "loss": 0.1054, + "num_tokens": 300395268.0, + "reward": 0.6984215378761292, + "reward_std": 0.22170159220695496, + "rewards/progression_diversity/mean": -0.0504293292760849, + "rewards/progression_diversity/std": 0.14640994369983673, + "rewards/symbolic_reward_accuracy/mean": 0.755859375, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.8512369394302368, + "rewards/symbolic_reward_partial_score/std": 0.3122048079967499, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0820285081863403, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 189.9932861328125, + "sampling/sampling_logp_difference/mean": 0.6296501159667969, + "step": 961 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.31053292751312256, + "epoch": 2.531578947368421, + "grad_norm": 0.010956598445773125, + "learning_rate": 1e-06, + "loss": 0.197, + "step": 962 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3268131613731384, + "epoch": 2.5342105263157895, + "grad_norm": 4.867886543273926, + "learning_rate": 1e-06, + "loss": 0.1752, + "step": 963 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.29976852238178253, + "epoch": 2.536842105263158, + "grad_norm": 0.28594204783439636, + "learning_rate": 1e-06, + "loss": 0.1525, + "step": 964 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14171.0, + "completions/mean_length": 1403.908203125, + "completions/mean_terminated_length": 667.1823120117188, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.3206019401550293, + "epoch": 2.5394736842105265, + "frac_reward_zero_std": 0.34375, + "grad_norm": 21.9819393157959, + "learning_rate": 1e-06, + "loss": 0.1878, + "num_tokens": 301511541.0, + "reward": 0.8047443628311157, + "reward_std": 0.16195279359817505, + "rewards/progression_diversity/mean": -0.028497815132141113, + "rewards/progression_diversity/std": 0.1169213354587555, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9347330331802368, + "rewards/symbolic_reward_partial_score/std": 0.21818473935127258, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0805866718292236, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 188.99998474121094, + "sampling/sampling_logp_difference/mean": 0.5423262119293213, + "step": 965 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.28471747040748596, + "epoch": 2.542105263157895, + "grad_norm": 0.39148885011672974, + "learning_rate": 1e-06, + "loss": 0.0897, + "step": 966 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.27406057715415955, + "epoch": 2.544736842105263, + "grad_norm": 0.004555261693894863, + "learning_rate": 1e-06, + "loss": 0.0543, + "step": 967 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.29411862790584564, + "epoch": 2.5473684210526315, + "grad_norm": 0.011435529217123985, + "learning_rate": 1e-06, + "loss": 0.1264, + "step": 968 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13752.0, + "completions/mean_length": 1564.11328125, + "completions/mean_terminated_length": 641.7137451171875, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.31351859867572784, + "epoch": 2.55, + "frac_reward_zero_std": 0.375, + "grad_norm": 25.21331787109375, + "learning_rate": 1e-06, + "loss": 0.1539, + "num_tokens": 302711503.0, + "reward": 0.7893426418304443, + "reward_std": 0.18001574277877808, + "rewards/progression_diversity/mean": -0.03058181144297123, + "rewards/progression_diversity/std": 0.11552340537309647, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9108072519302368, + "rewards/symbolic_reward_partial_score/std": 0.25809013843536377, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0829228162765503, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 189.0, + "sampling/sampling_logp_difference/mean": 0.5267312526702881, + "step": 969 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2877921462059021, + "epoch": 2.5526315789473686, + "grad_norm": 0.15190419554710388, + "learning_rate": 1e-06, + "loss": 0.0471, + "step": 970 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30551964044570923, + "epoch": 2.555263157894737, + "grad_norm": 0.005192040465772152, + "learning_rate": 1e-06, + "loss": 0.1395, + "step": 971 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.29702240228652954, + "epoch": 2.557894736842105, + "grad_norm": 0.016318701207637787, + "learning_rate": 1e-06, + "loss": 0.0983, + "step": 972 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11243.0, + "completions/mean_length": 1504.9765625, + "completions/mean_terminated_length": 578.8963012695312, + "completions/min_length": 247.0, + "completions/min_terminated_length": 247.0, + "entropy": 0.27735981345176697, + "epoch": 2.5605263157894735, + "frac_reward_zero_std": 0.34375, + "grad_norm": 5.76010274887085, + "learning_rate": 1e-06, + "loss": 0.0696, + "num_tokens": 303875171.0, + "reward": 0.7819315195083618, + "reward_std": 0.1941755712032318, + "rewards/progression_diversity/mean": -0.029511984437704086, + "rewards/progression_diversity/std": 0.11651825904846191, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.908203125, + "rewards/symbolic_reward_partial_score/std": 0.25990694761276245, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0842504501342773, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 192.0, + "sampling/sampling_logp_difference/mean": 0.5295515060424805, + "step": 973 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.33016660809516907, + "epoch": 2.5631578947368423, + "grad_norm": 0.0073794652707874775, + "learning_rate": 1e-06, + "loss": 0.201, + "step": 974 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.287723571062088, + "epoch": 2.5657894736842106, + "grad_norm": 0.01062384806573391, + "learning_rate": 1e-06, + "loss": 0.139, + "step": 975 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2862458676099777, + "epoch": 2.568421052631579, + "grad_norm": 0.01843968592584133, + "learning_rate": 1e-06, + "loss": 0.0968, + "step": 976 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13225.0, + "completions/mean_length": 1550.58203125, + "completions/mean_terminated_length": 627.3402709960938, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.30991949141025543, + "epoch": 2.5710526315789473, + "frac_reward_zero_std": 0.3125, + "grad_norm": 35.655460357666016, + "learning_rate": 1e-06, + "loss": 0.1447, + "num_tokens": 305055693.0, + "reward": 0.7691850662231445, + "reward_std": 0.20995613932609558, + "rewards/progression_diversity/mean": -0.02974330447614193, + "rewards/progression_diversity/std": 0.1124248057603836, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.8976237177848816, + "rewards/symbolic_reward_partial_score/std": 0.27150553464889526, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0818887948989868, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 199.03089904785156, + "sampling/sampling_logp_difference/mean": 0.4938344359397888, + "step": 977 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2864755541086197, + "epoch": 2.5736842105263156, + "grad_norm": 0.02253652736544609, + "learning_rate": 1e-06, + "loss": 0.0692, + "step": 978 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.28972767293453217, + "epoch": 2.5763157894736843, + "grad_norm": 0.007194779813289642, + "learning_rate": 1e-06, + "loss": 0.1079, + "step": 979 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.30334651470184326, + "epoch": 2.5789473684210527, + "grad_norm": 0.007408217992633581, + "learning_rate": 1e-06, + "loss": 0.1557, + "step": 980 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13332.0, + "completions/mean_length": 1399.427734375, + "completions/mean_terminated_length": 565.2350463867188, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.2880914658308029, + "epoch": 2.581578947368421, + "frac_reward_zero_std": 0.3125, + "grad_norm": 79.71167755126953, + "learning_rate": 1e-06, + "loss": 0.1494, + "num_tokens": 306174184.0, + "reward": 0.8039476275444031, + "reward_std": 0.21047255396842957, + "rewards/progression_diversity/mean": -0.025160329416394234, + "rewards/progression_diversity/std": 0.10529939085245132, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9241536855697632, + "rewards/symbolic_reward_partial_score/std": 0.23598654568195343, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0879991054534912, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 199.0, + "sampling/sampling_logp_difference/mean": 0.5080787539482117, + "step": 981 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2747099846601486, + "epoch": 2.5842105263157897, + "grad_norm": 0.09463394433259964, + "learning_rate": 1e-06, + "loss": 0.0628, + "step": 982 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.29164865612983704, + "epoch": 2.586842105263158, + "grad_norm": 0.016119860112667084, + "learning_rate": 1e-06, + "loss": 0.1221, + "step": 983 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3027169704437256, + "epoch": 2.5894736842105264, + "grad_norm": 0.01110443938523531, + "learning_rate": 1e-06, + "loss": 0.2168, + "step": 984 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14648.0, + "completions/mean_length": 1692.560546875, + "completions/mean_terminated_length": 614.5723266601562, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.30789491534233093, + "epoch": 2.5921052631578947, + "frac_reward_zero_std": 0.1875, + "grad_norm": 100.55760192871094, + "learning_rate": 1e-06, + "loss": 0.196, + "num_tokens": 307458599.0, + "reward": 0.7546229362487793, + "reward_std": 0.2402782291173935, + "rewards/progression_diversity/mean": -0.03575975075364113, + "rewards/progression_diversity/std": 0.12710198760032654, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.8974609375, + "rewards/symbolic_reward_partial_score/std": 0.26585423946380615, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0941245555877686, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 200.0, + "sampling/sampling_logp_difference/mean": 0.5682839751243591, + "step": 985 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3354620784521103, + "epoch": 2.594736842105263, + "grad_norm": 0.8639695644378662, + "learning_rate": 1e-06, + "loss": 0.2717, + "step": 986 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.27841542661190033, + "epoch": 2.5973684210526313, + "grad_norm": 0.023379117250442505, + "learning_rate": 1e-06, + "loss": 0.105, + "step": 987 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2825475037097931, + "epoch": 2.6, + "grad_norm": 0.06907982379198074, + "learning_rate": 1e-06, + "loss": 0.0687, + "step": 988 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13308.0, + "completions/mean_length": 1544.58203125, + "completions/mean_terminated_length": 588.1954345703125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.3159325271844864, + "epoch": 2.6026315789473684, + "frac_reward_zero_std": 0.375, + "grad_norm": 10.981948852539062, + "learning_rate": 1e-06, + "loss": 0.108, + "num_tokens": 308619185.0, + "reward": 0.7951602935791016, + "reward_std": 0.1814492791891098, + "rewards/progression_diversity/mean": -0.03475700318813324, + "rewards/progression_diversity/std": 0.13058389723300934, + "rewards/symbolic_reward_accuracy/mean": 0.876953125, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.9205728769302368, + "rewards/symbolic_reward_partial_score/std": 0.24516622722148895, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.087169885635376, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 207.0, + "sampling/sampling_logp_difference/mean": 0.5363781452178955, + "step": 989 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.292761892080307, + "epoch": 2.6052631578947367, + "grad_norm": 0.7690657377243042, + "learning_rate": 1e-06, + "loss": 0.0832, + "step": 990 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.30266323685646057, + "epoch": 2.6078947368421055, + "grad_norm": 0.011311556212604046, + "learning_rate": 1e-06, + "loss": 0.1176, + "step": 991 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30019472539424896, + "epoch": 2.610526315789474, + "grad_norm": 0.01206042617559433, + "learning_rate": 1e-06, + "loss": 0.1639, + "step": 992 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14076.0, + "completions/mean_length": 1671.501953125, + "completions/mean_terminated_length": 657.9060668945312, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.3192198872566223, + "epoch": 2.613157894736842, + "frac_reward_zero_std": 0.25, + "grad_norm": 54.62965393066406, + "learning_rate": 1e-06, + "loss": 0.2534, + "num_tokens": 309873970.0, + "reward": 0.763299822807312, + "reward_std": 0.21089032292366028, + "rewards/progression_diversity/mean": -0.04208873584866524, + "rewards/progression_diversity/std": 0.14815421402454376, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.9012044072151184, + "rewards/symbolic_reward_partial_score/std": 0.26881706714630127, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1019631624221802, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 211.0, + "sampling/sampling_logp_difference/mean": 0.8787015676498413, + "step": 993 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3022729456424713, + "epoch": 2.6157894736842104, + "grad_norm": 0.012045558542013168, + "learning_rate": 1e-06, + "loss": 0.1114, + "step": 994 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.298090398311615, + "epoch": 2.6184210526315788, + "grad_norm": 0.007539027836173773, + "learning_rate": 1e-06, + "loss": 0.1346, + "step": 995 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2837032973766327, + "epoch": 2.6210526315789475, + "grad_norm": 0.5139919519424438, + "learning_rate": 1e-06, + "loss": 0.1695, + "step": 996 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13815.0, + "completions/mean_length": 2134.458984375, + "completions/mean_terminated_length": 626.408203125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.28592349588871, + "epoch": 2.623684210526316, + "frac_reward_zero_std": 0.125, + "grad_norm": 19.418561935424805, + "learning_rate": 1e-06, + "loss": 0.0376, + "num_tokens": 311386141.0, + "reward": 0.7048972845077515, + "reward_std": 0.2682965099811554, + "rewards/progression_diversity/mean": -0.05226554721593857, + "rewards/progression_diversity/std": 0.15609407424926758, + "rewards/symbolic_reward_accuracy/mean": 0.7578125, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.8663736581802368, + "rewards/symbolic_reward_partial_score/std": 0.2934032678604126, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.119614601135254, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 214.0, + "sampling/sampling_logp_difference/mean": 1.162545919418335, + "step": 997 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.32562409341335297, + "epoch": 2.626315789473684, + "grad_norm": 0.011620646342635155, + "learning_rate": 1e-06, + "loss": 0.2374, + "step": 998 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3074010759592056, + "epoch": 2.6289473684210525, + "grad_norm": 33.348323822021484, + "learning_rate": 1e-06, + "loss": 0.1771, + "step": 999 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.2109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.31934235990047455, + "epoch": 2.6315789473684212, + "grad_norm": 0.009514860808849335, + "learning_rate": 1e-06, + "loss": 0.315, + "step": 1000 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13780.0, + "completions/mean_length": 2376.53125, + "completions/mean_terminated_length": 690.7308349609375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.30572909116744995, + "epoch": 2.6342105263157896, + "frac_reward_zero_std": 0.1875, + "grad_norm": 94.71639251708984, + "learning_rate": 1e-06, + "loss": 0.2104, + "num_tokens": 312988045.0, + "reward": 0.7043373584747314, + "reward_std": 0.23734444379806519, + "rewards/progression_diversity/mean": -0.05454763025045395, + "rewards/progression_diversity/std": 0.15527451038360596, + "rewards/symbolic_reward_accuracy/mean": 0.76953125, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.8483072519302368, + "rewards/symbolic_reward_partial_score/std": 0.3243545591831207, + "rewards/tag_count_reward/mean": -0.11328125, + "rewards/tag_count_reward/std": 0.3172462284564972, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1103978157043457, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 215.0, + "sampling/sampling_logp_difference/mean": 0.8874210119247437, + "step": 1001 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.29766781628131866, + "epoch": 2.636842105263158, + "grad_norm": 1.0701483488082886, + "learning_rate": 1e-06, + "loss": 0.1461, + "step": 1002 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.29889827966690063, + "epoch": 2.639473684210526, + "grad_norm": 0.014587471261620522, + "learning_rate": 1e-06, + "loss": 0.1481, + "step": 1003 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.33510279655456543, + "epoch": 2.6421052631578945, + "grad_norm": 0.6961768269538879, + "learning_rate": 1e-06, + "loss": 0.2699, + "step": 1004 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13484.0, + "completions/mean_length": 1625.13671875, + "completions/mean_terminated_length": 575.3430786132812, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.29443803429603577, + "epoch": 2.6447368421052633, + "frac_reward_zero_std": 0.21875, + "grad_norm": 39.419071197509766, + "learning_rate": 1e-06, + "loss": 0.1312, + "num_tokens": 314233267.0, + "reward": 0.7752651572227478, + "reward_std": 0.22436900436878204, + "rewards/progression_diversity/mean": -0.0369657427072525, + "rewards/progression_diversity/std": 0.13464143872261047, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.91162109375, + "rewards/symbolic_reward_partial_score/std": 0.2517907917499542, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1153823137283325, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 214.0, + "sampling/sampling_logp_difference/mean": 0.7468029260635376, + "step": 1005 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.31393955647945404, + "epoch": 2.6473684210526316, + "grad_norm": 49.80437469482422, + "learning_rate": 1e-06, + "loss": 0.2599, + "step": 1006 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2889316976070404, + "epoch": 2.65, + "grad_norm": 2.7875256538391113, + "learning_rate": 1e-06, + "loss": 0.0751, + "step": 1007 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.305808961391449, + "epoch": 2.6526315789473687, + "grad_norm": 3.0765891075134277, + "learning_rate": 1e-06, + "loss": 0.2263, + "step": 1008 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14044.0, + "completions/mean_length": 1598.720703125, + "completions/mean_terminated_length": 678.4751586914062, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.30450139939785004, + "epoch": 2.655263157894737, + "frac_reward_zero_std": 0.125, + "grad_norm": 31.50531578063965, + "learning_rate": 1e-06, + "loss": 0.1628, + "num_tokens": 315446116.0, + "reward": 0.7807782292366028, + "reward_std": 0.25463420152664185, + "rewards/progression_diversity/mean": -0.037412114441394806, + "rewards/progression_diversity/std": 0.1364642083644867, + "rewards/symbolic_reward_accuracy/mean": 0.86328125, + "rewards/symbolic_reward_accuracy/std": 0.3438861668109894, + "rewards/symbolic_reward_partial_score/mean": 0.8994140625, + "rewards/symbolic_reward_partial_score/std": 0.2765096426010132, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1178048849105835, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 216.0, + "sampling/sampling_logp_difference/mean": 0.7162982225418091, + "step": 1009 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.28954601287841797, + "epoch": 2.6578947368421053, + "grad_norm": 10.927779197692871, + "learning_rate": 1e-06, + "loss": 0.1193, + "step": 1010 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.33044156432151794, + "epoch": 2.6605263157894736, + "grad_norm": 0.021338341757655144, + "learning_rate": 1e-06, + "loss": 0.3381, + "step": 1011 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.287820965051651, + "epoch": 2.663157894736842, + "grad_norm": 0.009080777876079082, + "learning_rate": 1e-06, + "loss": 0.1125, + "step": 1012 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13928.0, + "completions/mean_length": 1759.529296875, + "completions/mean_terminated_length": 620.3599853515625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.3139393925666809, + "epoch": 2.6657894736842103, + "frac_reward_zero_std": 0.28125, + "grad_norm": 21.068523406982422, + "learning_rate": 1e-06, + "loss": 0.1538, + "num_tokens": 316735123.0, + "reward": 0.7530990839004517, + "reward_std": 0.20721742510795593, + "rewards/progression_diversity/mean": -0.036777839064598083, + "rewards/progression_diversity/std": 0.12973876297473907, + "rewards/symbolic_reward_accuracy/mean": 0.82421875, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.88720703125, + "rewards/symbolic_reward_partial_score/std": 0.28543245792388916, + "rewards/tag_count_reward/mean": -0.072265625, + "rewards/tag_count_reward/std": 0.2591804563999176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1145870685577393, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 218.0, + "sampling/sampling_logp_difference/mean": 0.6381403803825378, + "step": 1013 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2958460748195648, + "epoch": 2.668421052631579, + "grad_norm": 0.33650216460227966, + "learning_rate": 1e-06, + "loss": 0.0614, + "step": 1014 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3110879957675934, + "epoch": 2.6710526315789473, + "grad_norm": 0.006965796463191509, + "learning_rate": 1e-06, + "loss": 0.1327, + "step": 1015 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.31926093995571136, + "epoch": 2.6736842105263157, + "grad_norm": 0.009219714440405369, + "learning_rate": 1e-06, + "loss": 0.17, + "step": 1016 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15009.0, + "completions/mean_length": 1719.177734375, + "completions/mean_terminated_length": 676.0731811523438, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.3213798254728317, + "epoch": 2.6763157894736844, + "frac_reward_zero_std": 0.09375, + "grad_norm": 36.93714904785156, + "learning_rate": 1e-06, + "loss": 0.1665, + "num_tokens": 317986126.0, + "reward": 0.7775270938873291, + "reward_std": 0.24769052863121033, + "rewards/progression_diversity/mean": -0.03537972271442413, + "rewards/progression_diversity/std": 0.1272072046995163, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.90673828125, + "rewards/symbolic_reward_partial_score/std": 0.263979971408844, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1412655115127563, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 219.0, + "sampling/sampling_logp_difference/mean": 0.8786369562149048, + "step": 1017 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3090389519929886, + "epoch": 2.6789473684210527, + "grad_norm": 69.12992095947266, + "learning_rate": 1e-06, + "loss": 0.1427, + "step": 1018 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.333890438079834, + "epoch": 2.681578947368421, + "grad_norm": 0.026694627478718758, + "learning_rate": 1e-06, + "loss": 0.2929, + "step": 1019 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.316163033246994, + "epoch": 2.6842105263157894, + "grad_norm": 0.014393487945199013, + "learning_rate": 1e-06, + "loss": 0.1923, + "step": 1020 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15286.0, + "completions/mean_length": 3114.5078125, + "completions/mean_terminated_length": 729.668212890625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.36883601546287537, + "epoch": 2.6868421052631577, + "frac_reward_zero_std": 0.125, + "grad_norm": 56.34549331665039, + "learning_rate": 1e-06, + "loss": 0.3005, + "num_tokens": 320001746.0, + "reward": 0.7100275158882141, + "reward_std": 0.24902528524398804, + "rewards/progression_diversity/mean": -0.07147189974784851, + "rewards/progression_diversity/std": 0.16751477122306824, + "rewards/symbolic_reward_accuracy/mean": 0.7890625, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.84375, + "rewards/symbolic_reward_partial_score/std": 0.33665984869003296, + "rewards/tag_count_reward/mean": -0.158203125, + "rewards/tag_count_reward/std": 0.36528825759887695, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1642587184906006, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 220.0, + "sampling/sampling_logp_difference/mean": 1.0265074968338013, + "step": 1021 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.33186276257038116, + "epoch": 2.6894736842105265, + "grad_norm": 2.4869983196258545, + "learning_rate": 1e-06, + "loss": 0.2048, + "step": 1022 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3606336712837219, + "epoch": 2.692105263157895, + "grad_norm": 1.9981337785720825, + "learning_rate": 1e-06, + "loss": 0.2636, + "step": 1023 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.35093724727630615, + "epoch": 2.694736842105263, + "grad_norm": 1.4287171363830566, + "learning_rate": 1e-06, + "loss": 0.2628, + "step": 1024 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14680.0, + "completions/mean_length": 2566.037109375, + "completions/mean_terminated_length": 731.7942504882812, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.3709093779325485, + "epoch": 2.6973684210526314, + "frac_reward_zero_std": 0.09375, + "grad_norm": 38.17184829711914, + "learning_rate": 1e-06, + "loss": 0.4175, + "num_tokens": 321699173.0, + "reward": 0.709370493888855, + "reward_std": 0.2614933252334595, + "rewards/progression_diversity/mean": -0.059050630778074265, + "rewards/progression_diversity/std": 0.1578987091779709, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.83984375, + "rewards/symbolic_reward_partial_score/std": 0.3304028809070587, + "rewards/tag_count_reward/mean": -0.119140625, + "rewards/tag_count_reward/std": 0.32427072525024414, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1661169528961182, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 220.0, + "sampling/sampling_logp_difference/mean": 1.1539242267608643, + "step": 1025 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.40625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.3150624781847, + "epoch": 2.7, + "grad_norm": 1.1932648420333862, + "learning_rate": 1e-06, + "loss": 0.1208, + "step": 1026 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.35171347856521606, + "epoch": 2.7026315789473685, + "grad_norm": 0.0064244456589221954, + "learning_rate": 1e-06, + "loss": 0.3091, + "step": 1027 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.32932211458683014, + "epoch": 2.705263157894737, + "grad_norm": 0.20103150606155396, + "learning_rate": 1e-06, + "loss": 0.1512, + "step": 1028 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14449.0, + "completions/mean_length": 2274.8984375, + "completions/mean_terminated_length": 714.021728515625, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.3005029112100601, + "epoch": 2.707894736842105, + "frac_reward_zero_std": 0.1875, + "grad_norm": 63.158145904541016, + "learning_rate": 1e-06, + "loss": 0.1651, + "num_tokens": 323267889.0, + "reward": 0.7657883167266846, + "reward_std": 0.2607175409793854, + "rewards/progression_diversity/mean": -0.05203522741794586, + "rewards/progression_diversity/std": 0.15054935216903687, + "rewards/symbolic_reward_accuracy/mean": 0.849609375, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.8870442509651184, + "rewards/symbolic_reward_partial_score/std": 0.29508423805236816, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.148714303970337, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 221.0, + "sampling/sampling_logp_difference/mean": 1.0465000867843628, + "step": 1029 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3231324404478073, + "epoch": 2.7105263157894735, + "grad_norm": 0.05354348570108414, + "learning_rate": 1e-06, + "loss": 0.175, + "step": 1030 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3530917465686798, + "epoch": 2.713157894736842, + "grad_norm": 0.014254847541451454, + "learning_rate": 1e-06, + "loss": 0.2848, + "step": 1031 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3636767566204071, + "epoch": 2.7157894736842105, + "grad_norm": 0.34099018573760986, + "learning_rate": 1e-06, + "loss": 0.3258, + "step": 1032 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15208.0, + "completions/mean_length": 3406.103515625, + "completions/mean_terminated_length": 859.0396728515625, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.3606864959001541, + "epoch": 2.718421052631579, + "frac_reward_zero_std": 0.0, + "grad_norm": 110.6368637084961, + "learning_rate": 1e-06, + "loss": 0.2805, + "num_tokens": 325423462.0, + "reward": 0.6178954243659973, + "reward_std": 0.3361561894416809, + "rewards/progression_diversity/mean": -0.08546093106269836, + "rewards/progression_diversity/std": 0.18625648319721222, + "rewards/symbolic_reward_accuracy/mean": 0.666015625, + "rewards/symbolic_reward_accuracy/std": 0.47209542989730835, + "rewards/symbolic_reward_partial_score/mean": 0.7877604365348816, + "rewards/symbolic_reward_partial_score/std": 0.3652539551258087, + "rewards/tag_count_reward/mean": -0.171875, + "rewards/tag_count_reward/std": 0.3776407241821289, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1710354089736938, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 224.0, + "sampling/sampling_logp_difference/mean": 1.457003116607666, + "step": 1033 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.36586907505989075, + "epoch": 2.7210526315789476, + "grad_norm": 47.91674041748047, + "learning_rate": 1e-06, + "loss": 0.3584, + "step": 1034 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.2421875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.35853834450244904, + "epoch": 2.723684210526316, + "grad_norm": 1.4703402519226074, + "learning_rate": 1e-06, + "loss": 0.2924, + "step": 1035 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3385375142097473, + "epoch": 2.7263157894736842, + "grad_norm": 10.245598793029785, + "learning_rate": 1e-06, + "loss": 0.1748, + "step": 1036 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10898.0, + "completions/mean_length": 2200.62109375, + "completions/mean_terminated_length": 597.2825927734375, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.30844029784202576, + "epoch": 2.7289473684210526, + "frac_reward_zero_std": 0.21875, + "grad_norm": 21.34285545349121, + "learning_rate": 1e-06, + "loss": 0.0902, + "num_tokens": 326975684.0, + "reward": 0.7785071730613708, + "reward_std": 0.2188257873058319, + "rewards/progression_diversity/mean": -0.05456160753965378, + "rewards/progression_diversity/std": 0.16060733795166016, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.9073892831802368, + "rewards/symbolic_reward_partial_score/std": 0.2701605260372162, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1437357664108276, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 223.0, + "sampling/sampling_logp_difference/mean": 1.1044782400131226, + "step": 1037 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.35305924713611603, + "epoch": 2.731578947368421, + "grad_norm": 2.7072434425354004, + "learning_rate": 1e-06, + "loss": 0.3355, + "step": 1038 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.31338661909103394, + "epoch": 2.734210526315789, + "grad_norm": 16.57544708251953, + "learning_rate": 1e-06, + "loss": 0.1466, + "step": 1039 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3223544955253601, + "epoch": 2.736842105263158, + "grad_norm": 0.008024279028177261, + "learning_rate": 1e-06, + "loss": 0.2434, + "step": 1040 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14168.0, + "completions/mean_length": 2421.57421875, + "completions/mean_terminated_length": 637.8281860351562, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.3547728508710861, + "epoch": 2.7394736842105263, + "frac_reward_zero_std": 0.09375, + "grad_norm": 86.12596130371094, + "learning_rate": 1e-06, + "loss": 0.3437, + "num_tokens": 328613322.0, + "reward": 0.7703942060470581, + "reward_std": 0.2505676746368408, + "rewards/progression_diversity/mean": -0.06019480898976326, + "rewards/progression_diversity/std": 0.1668085753917694, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.9000651240348816, + "rewards/symbolic_reward_partial_score/std": 0.27447789907455444, + "rewards/tag_count_reward/mean": -0.111328125, + "rewards/tag_count_reward/std": 0.31484565138816833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.165305256843567, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 227.0, + "sampling/sampling_logp_difference/mean": 1.1906356811523438, + "step": 1041 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4296875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3167977035045624, + "epoch": 2.7421052631578946, + "grad_norm": 6.371710300445557, + "learning_rate": 1e-06, + "loss": 0.1303, + "step": 1042 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.3315960764884949, + "epoch": 2.7447368421052634, + "grad_norm": 22.913915634155273, + "learning_rate": 1e-06, + "loss": 0.2824, + "step": 1043 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3117201626300812, + "epoch": 2.7473684210526317, + "grad_norm": 5.115964412689209, + "learning_rate": 1e-06, + "loss": 0.253, + "step": 1044 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13730.0, + "completions/mean_length": 2225.83984375, + "completions/mean_terminated_length": 659.5358276367188, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.31773585081100464, + "epoch": 2.75, + "frac_reward_zero_std": 0.1875, + "grad_norm": 119.56202697753906, + "learning_rate": 1e-06, + "loss": 0.1282, + "num_tokens": 330152088.0, + "reward": 0.742752194404602, + "reward_std": 0.24866357445716858, + "rewards/progression_diversity/mean": -0.055839426815509796, + "rewards/progression_diversity/std": 0.16208583116531372, + "rewards/symbolic_reward_accuracy/mean": 0.814453125, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.88134765625, + "rewards/symbolic_reward_partial_score/std": 0.2882753908634186, + "rewards/tag_count_reward/mean": -0.09765625, + "rewards/tag_count_reward/std": 0.29713961482048035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1512091159820557, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 230.999267578125, + "sampling/sampling_logp_difference/mean": 1.0916484594345093, + "step": 1045 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3315199911594391, + "epoch": 2.7526315789473683, + "grad_norm": 85.15950012207031, + "learning_rate": 1e-06, + "loss": 0.2759, + "step": 1046 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3466692268848419, + "epoch": 2.7552631578947366, + "grad_norm": 0.012678657658398151, + "learning_rate": 1e-06, + "loss": 0.2945, + "step": 1047 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.31701667606830597, + "epoch": 2.7578947368421054, + "grad_norm": 0.01987772062420845, + "learning_rate": 1e-06, + "loss": 0.1537, + "step": 1048 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13047.0, + "completions/mean_length": 2842.43359375, + "completions/mean_terminated_length": 626.5408935546875, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.32595211267471313, + "epoch": 2.7605263157894737, + "frac_reward_zero_std": 0.125, + "grad_norm": 42.63386535644531, + "learning_rate": 1e-06, + "loss": 0.1182, + "num_tokens": 332023574.0, + "reward": 0.6774349212646484, + "reward_std": 0.2632233202457428, + "rewards/progression_diversity/mean": -0.06901118159294128, + "rewards/progression_diversity/std": 0.17166905105113983, + "rewards/symbolic_reward_accuracy/mean": 0.734375, + "rewards/symbolic_reward_accuracy/std": 0.44209739565849304, + "rewards/symbolic_reward_partial_score/mean": 0.8352864980697632, + "rewards/symbolic_reward_partial_score/std": 0.32469478249549866, + "rewards/tag_count_reward/mean": -0.130859375, + "rewards/tag_count_reward/std": 0.33757632970809937, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.155900239944458, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 234.0, + "sampling/sampling_logp_difference/mean": 1.1260725259780884, + "step": 1049 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.3452259302139282, + "epoch": 2.763157894736842, + "grad_norm": 5.842362403869629, + "learning_rate": 1e-06, + "loss": 0.1744, + "step": 1050 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3828125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.34844136238098145, + "epoch": 2.765789473684211, + "grad_norm": 14.326484680175781, + "learning_rate": 1e-06, + "loss": 0.2425, + "step": 1051 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.35987676680088043, + "epoch": 2.768421052631579, + "grad_norm": 30.41329002380371, + "learning_rate": 1e-06, + "loss": 0.2773, + "step": 1052 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12739.0, + "completions/mean_length": 2201.59375, + "completions/mean_terminated_length": 666.7012939453125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.32392917573451996, + "epoch": 2.7710526315789474, + "frac_reward_zero_std": 0.25, + "grad_norm": 95.98092651367188, + "learning_rate": 1e-06, + "loss": 0.1847, + "num_tokens": 333554758.0, + "reward": 0.7248282432556152, + "reward_std": 0.2060919851064682, + "rewards/progression_diversity/mean": -0.04647231847047806, + "rewards/progression_diversity/std": 0.1398431360721588, + "rewards/symbolic_reward_accuracy/mean": 0.791015625, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.8675130009651184, + "rewards/symbolic_reward_partial_score/std": 0.2977105677127838, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1349936723709106, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 236.0, + "sampling/sampling_logp_difference/mean": 0.8933988809585571, + "step": 1053 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.33513136208057404, + "epoch": 2.7736842105263158, + "grad_norm": 0.05125413089990616, + "learning_rate": 1e-06, + "loss": 0.139, + "step": 1054 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.32300393283367157, + "epoch": 2.776315789473684, + "grad_norm": 2.2907490730285645, + "learning_rate": 1e-06, + "loss": 0.1541, + "step": 1055 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.32895462214946747, + "epoch": 2.7789473684210524, + "grad_norm": 0.22027532756328583, + "learning_rate": 1e-06, + "loss": 0.1606, + "step": 1056 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12516.0, + "completions/mean_length": 1867.125, + "completions/mean_terminated_length": 536.1535034179688, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.32684555649757385, + "epoch": 2.781578947368421, + "frac_reward_zero_std": 0.3125, + "grad_norm": 27.86194610595703, + "learning_rate": 1e-06, + "loss": 0.1246, + "num_tokens": 334916038.0, + "reward": 0.7695657014846802, + "reward_std": 0.18517985939979553, + "rewards/progression_diversity/mean": -0.040504228323698044, + "rewards/progression_diversity/std": 0.13616561889648438, + "rewards/symbolic_reward_accuracy/mean": 0.845703125, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.89990234375, + "rewards/symbolic_reward_partial_score/std": 0.2672683298587799, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1254425048828125, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 236.0, + "sampling/sampling_logp_difference/mean": 0.741882860660553, + "step": 1057 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3313567191362381, + "epoch": 2.7842105263157895, + "grad_norm": 6.761070728302002, + "learning_rate": 1e-06, + "loss": 0.2295, + "step": 1058 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3324154317378998, + "epoch": 2.786842105263158, + "grad_norm": 0.011156097054481506, + "learning_rate": 1e-06, + "loss": 0.0708, + "step": 1059 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.33027806878089905, + "epoch": 2.7894736842105265, + "grad_norm": 0.16176286339759827, + "learning_rate": 1e-06, + "loss": 0.1187, + "step": 1060 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15293.0, + "completions/mean_length": 2273.78125, + "completions/mean_terminated_length": 678.7130126953125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.3395443707704544, + "epoch": 2.792105263157895, + "frac_reward_zero_std": 0.21875, + "grad_norm": 84.5785140991211, + "learning_rate": 1e-06, + "loss": 0.3383, + "num_tokens": 336512054.0, + "reward": 0.7558015584945679, + "reward_std": 0.23675483465194702, + "rewards/progression_diversity/mean": -0.05461447685956955, + "rewards/progression_diversity/std": 0.15894843637943268, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.8889973759651184, + "rewards/symbolic_reward_partial_score/std": 0.28549298644065857, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1387587785720825, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 236.0, + "sampling/sampling_logp_difference/mean": 0.9233871698379517, + "step": 1061 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3283067047595978, + "epoch": 2.794736842105263, + "grad_norm": 13.06532096862793, + "learning_rate": 1e-06, + "loss": 0.2576, + "step": 1062 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3241392523050308, + "epoch": 2.7973684210526315, + "grad_norm": 1.6975409984588623, + "learning_rate": 1e-06, + "loss": 0.0366, + "step": 1063 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.31890368461608887, + "epoch": 2.8, + "grad_norm": 10.018250465393066, + "learning_rate": 1e-06, + "loss": 0.1212, + "step": 1064 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14169.0, + "completions/mean_length": 2099.845703125, + "completions/mean_terminated_length": 622.174560546875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.3201150894165039, + "epoch": 2.8026315789473686, + "frac_reward_zero_std": 0.125, + "grad_norm": 316.5431823730469, + "learning_rate": 1e-06, + "loss": 0.188, + "num_tokens": 338001831.0, + "reward": 0.7660241723060608, + "reward_std": 0.2371438443660736, + "rewards/progression_diversity/mean": -0.052859190851449966, + "rewards/progression_diversity/std": 0.16034673154354095, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.8904622793197632, + "rewards/symbolic_reward_partial_score/std": 0.28877806663513184, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1355504989624023, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 236.0, + "sampling/sampling_logp_difference/mean": 0.9679932594299316, + "step": 1065 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.32754945755004883, + "epoch": 2.805263157894737, + "grad_norm": 55.01374053955078, + "learning_rate": 1e-06, + "loss": 0.155, + "step": 1066 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.33194398880004883, + "epoch": 2.807894736842105, + "grad_norm": 0.01747787557542324, + "learning_rate": 1e-06, + "loss": 0.2416, + "step": 1067 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.33147796988487244, + "epoch": 2.8105263157894735, + "grad_norm": 0.019125865772366524, + "learning_rate": 1e-06, + "loss": 0.2642, + "step": 1068 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14000.0, + "completions/mean_length": 2323.65625, + "completions/mean_terminated_length": 596.9473876953125, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.33723948895931244, + "epoch": 2.8131578947368423, + "frac_reward_zero_std": 0.1875, + "grad_norm": 55.272857666015625, + "learning_rate": 1e-06, + "loss": 0.2617, + "num_tokens": 339580151.0, + "reward": 0.7574684619903564, + "reward_std": 0.24873128533363342, + "rewards/progression_diversity/mean": -0.06370329856872559, + "rewards/progression_diversity/std": 0.176786869764328, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.8811849355697632, + "rewards/symbolic_reward_partial_score/std": 0.2997874319553375, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1518162488937378, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 235.0, + "sampling/sampling_logp_difference/mean": 1.1923803091049194, + "step": 1069 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3266005963087082, + "epoch": 2.8157894736842106, + "grad_norm": 14.835233688354492, + "learning_rate": 1e-06, + "loss": 0.1812, + "step": 1070 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3344879746437073, + "epoch": 2.818421052631579, + "grad_norm": 20.65220832824707, + "learning_rate": 1e-06, + "loss": 0.2772, + "step": 1071 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.32967130839824677, + "epoch": 2.8210526315789473, + "grad_norm": 15.066210746765137, + "learning_rate": 1e-06, + "loss": 0.2274, + "step": 1072 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13483.0, + "completions/mean_length": 2695.2265625, + "completions/mean_terminated_length": 739.6875610351562, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.3279756009578705, + "epoch": 2.8236842105263156, + "frac_reward_zero_std": 0.125, + "grad_norm": 82.50452423095703, + "learning_rate": 1e-06, + "loss": 0.1288, + "num_tokens": 341365259.0, + "reward": 0.6973080039024353, + "reward_std": 0.2565169632434845, + "rewards/progression_diversity/mean": -0.06900518387556076, + "rewards/progression_diversity/std": 0.174638569355011, + "rewards/symbolic_reward_accuracy/mean": 0.763671875, + "rewards/symbolic_reward_accuracy/std": 0.42524150013923645, + "rewards/symbolic_reward_partial_score/mean": 0.8409830331802368, + "rewards/symbolic_reward_partial_score/std": 0.327880322933197, + "rewards/tag_count_reward/mean": -0.125, + "rewards/tag_count_reward/std": 0.3310423493385315, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.146012783050537, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 238.0, + "sampling/sampling_logp_difference/mean": 1.2247239351272583, + "step": 1073 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3268892467021942, + "epoch": 2.8263157894736843, + "grad_norm": 0.03915620595216751, + "learning_rate": 1e-06, + "loss": 0.3393, + "step": 1074 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3162257820367813, + "epoch": 2.8289473684210527, + "grad_norm": 1.8628556728363037, + "learning_rate": 1e-06, + "loss": 0.1444, + "step": 1075 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.32084639370441437, + "epoch": 2.831578947368421, + "grad_norm": 0.034546032547950745, + "learning_rate": 1e-06, + "loss": 0.2549, + "step": 1076 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.119140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13901.0, + "completions/mean_length": 2555.38671875, + "completions/mean_terminated_length": 684.997802734375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.31973057985305786, + "epoch": 2.8342105263157897, + "frac_reward_zero_std": 0.09375, + "grad_norm": 80.3057861328125, + "learning_rate": 1e-06, + "loss": 0.1383, + "num_tokens": 343049393.0, + "reward": 0.7412656545639038, + "reward_std": 0.25144657492637634, + "rewards/progression_diversity/mean": -0.06289247423410416, + "rewards/progression_diversity/std": 0.16770640015602112, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.8714193105697632, + "rewards/symbolic_reward_partial_score/std": 0.30579882860183716, + "rewards/tag_count_reward/mean": -0.1171875, + "rewards/tag_count_reward/std": 0.32195815443992615, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1399236917495728, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 239.99998474121094, + "sampling/sampling_logp_difference/mean": 1.2581195831298828, + "step": 1077 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3828125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.31444528698921204, + "epoch": 2.836842105263158, + "grad_norm": 55.245845794677734, + "learning_rate": 1e-06, + "loss": 0.2088, + "step": 1078 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4921875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.32562339305877686, + "epoch": 2.8394736842105264, + "grad_norm": 34.23236846923828, + "learning_rate": 1e-06, + "loss": 0.2752, + "step": 1079 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.40625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.32775624096393585, + "epoch": 2.8421052631578947, + "grad_norm": 23.81574821472168, + "learning_rate": 1e-06, + "loss": 0.2996, + "step": 1080 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9691.0, + "completions/mean_length": 2100.416015625, + "completions/mean_terminated_length": 554.5736083984375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.3156034052371979, + "epoch": 2.844736842105263, + "frac_reward_zero_std": 0.21875, + "grad_norm": 43.40895462036133, + "learning_rate": 1e-06, + "loss": 0.1877, + "num_tokens": 344549286.0, + "reward": 0.7650686502456665, + "reward_std": 0.24320955574512482, + "rewards/progression_diversity/mean": -0.05075865983963013, + "rewards/progression_diversity/std": 0.15617826581001282, + "rewards/symbolic_reward_accuracy/mean": 0.845703125, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.8904622793197632, + "rewards/symbolic_reward_partial_score/std": 0.2910281717777252, + "rewards/tag_count_reward/mean": -0.08984375, + "rewards/tag_count_reward/std": 0.2862374484539032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1255333423614502, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 240.0, + "sampling/sampling_logp_difference/mean": 1.064498782157898, + "step": 1081 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.30958323180675507, + "epoch": 2.8473684210526313, + "grad_norm": 9.998337745666504, + "learning_rate": 1e-06, + "loss": 0.1545, + "step": 1082 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3141885995864868, + "epoch": 2.85, + "grad_norm": 0.007370346691459417, + "learning_rate": 1e-06, + "loss": 0.2394, + "step": 1083 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3183276653289795, + "epoch": 2.8526315789473684, + "grad_norm": 0.008199164643883705, + "learning_rate": 1e-06, + "loss": 0.2117, + "step": 1084 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13934.0, + "completions/mean_length": 2409.056640625, + "completions/mean_terminated_length": 623.71142578125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.323482483625412, + "epoch": 2.8552631578947367, + "frac_reward_zero_std": 0.1875, + "grad_norm": 53.07914352416992, + "learning_rate": 1e-06, + "loss": 0.1864, + "num_tokens": 346174179.0, + "reward": 0.7465149164199829, + "reward_std": 0.27112072706222534, + "rewards/progression_diversity/mean": -0.06531209498643875, + "rewards/progression_diversity/std": 0.1783425658941269, + "rewards/symbolic_reward_accuracy/mean": 0.82421875, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.8785806894302368, + "rewards/symbolic_reward_partial_score/std": 0.3004186153411865, + "rewards/tag_count_reward/mean": -0.109375, + "rewards/tag_count_reward/std": 0.31241437792778015, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1350785493850708, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 240.0, + "sampling/sampling_logp_difference/mean": 1.2513222694396973, + "step": 1085 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.32216334342956543, + "epoch": 2.8578947368421055, + "grad_norm": 3.4325926303863525, + "learning_rate": 1e-06, + "loss": 0.2676, + "step": 1086 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3240419775247574, + "epoch": 2.860526315789474, + "grad_norm": 15.382451057434082, + "learning_rate": 1e-06, + "loss": 0.2478, + "step": 1087 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.32489949464797974, + "epoch": 2.863157894736842, + "grad_norm": 0.019483868032693863, + "learning_rate": 1e-06, + "loss": 0.1895, + "step": 1088 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12305.0, + "completions/mean_length": 2428.5703125, + "completions/mean_terminated_length": 645.718017578125, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.3152479976415634, + "epoch": 2.8657894736842104, + "frac_reward_zero_std": 0.21875, + "grad_norm": 148.0259246826172, + "learning_rate": 1e-06, + "loss": 0.1724, + "num_tokens": 347824423.0, + "reward": 0.7290340662002563, + "reward_std": 0.232526957988739, + "rewards/progression_diversity/mean": -0.0604589506983757, + "rewards/progression_diversity/std": 0.16615836322307587, + "rewards/symbolic_reward_accuracy/mean": 0.80078125, + "rewards/symbolic_reward_accuracy/std": 0.39980348944664, + "rewards/symbolic_reward_partial_score/mean": 0.8663736581802368, + "rewards/symbolic_reward_partial_score/std": 0.31190311908721924, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.127872109413147, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 240.0, + "sampling/sampling_logp_difference/mean": 1.2021994590759277, + "step": 1089 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3240260183811188, + "epoch": 2.8684210526315788, + "grad_norm": 14.310028076171875, + "learning_rate": 1e-06, + "loss": 0.2462, + "step": 1090 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.31351564824581146, + "epoch": 2.8710526315789475, + "grad_norm": 0.017242752015590668, + "learning_rate": 1e-06, + "loss": 0.0796, + "step": 1091 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3828125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5390625, + "entropy": 0.3323921114206314, + "epoch": 2.873684210526316, + "grad_norm": 0.0029714410193264484, + "learning_rate": 1e-06, + "loss": 0.3207, + "step": 1092 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.107421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16382.0, + "completions/mean_length": 2427.0859375, + "completions/mean_terminated_length": 747.3698120117188, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.31317219138145447, + "epoch": 2.876315789473684, + "frac_reward_zero_std": 0.1875, + "grad_norm": 36.501434326171875, + "learning_rate": 1e-06, + "loss": 0.2128, + "num_tokens": 349474067.0, + "reward": 0.7331509590148926, + "reward_std": 0.24093718826770782, + "rewards/progression_diversity/mean": -0.05892288684844971, + "rewards/progression_diversity/std": 0.1655765026807785, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.86767578125, + "rewards/symbolic_reward_partial_score/std": 0.31132596731185913, + "rewards/tag_count_reward/mean": -0.10546875, + "rewards/tag_count_reward/std": 0.3074568510055542, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.153733253479004, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 241.0, + "sampling/sampling_logp_difference/mean": 1.242756724357605, + "step": 1093 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.3204523026943207, + "epoch": 2.8789473684210525, + "grad_norm": 3.31962513923645, + "learning_rate": 1e-06, + "loss": 0.2336, + "step": 1094 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.31440404057502747, + "epoch": 2.8815789473684212, + "grad_norm": 7.084986686706543, + "learning_rate": 1e-06, + "loss": 0.1256, + "step": 1095 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.32728545367717743, + "epoch": 2.8842105263157896, + "grad_norm": 0.10335738956928253, + "learning_rate": 1e-06, + "loss": 0.2388, + "step": 1096 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.142578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16244.0, + "completions/mean_length": 2942.568359375, + "completions/mean_terminated_length": 707.432861328125, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.32194942235946655, + "epoch": 2.886842105263158, + "frac_reward_zero_std": 0.1875, + "grad_norm": 62.69070053100586, + "learning_rate": 1e-06, + "loss": 0.2512, + "num_tokens": 351364438.0, + "reward": 0.6914088726043701, + "reward_std": 0.22169052064418793, + "rewards/progression_diversity/mean": -0.07298420369625092, + "rewards/progression_diversity/std": 0.1761486977338791, + "rewards/symbolic_reward_accuracy/mean": 0.7578125, + "rewards/symbolic_reward_accuracy/std": 0.42882615327835083, + "rewards/symbolic_reward_partial_score/mean": 0.8409830331802368, + "rewards/symbolic_reward_partial_score/std": 0.3356630504131317, + "rewards/tag_count_reward/mean": -0.1484375, + "rewards/tag_count_reward/std": 0.35588082671165466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.162922978401184, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 243.0, + "sampling/sampling_logp_difference/mean": 1.4567382335662842, + "step": 1097 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.32985997200012207, + "epoch": 2.889473684210526, + "grad_norm": 2.9098451137542725, + "learning_rate": 1e-06, + "loss": 0.2758, + "step": 1098 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3266884684562683, + "epoch": 2.8921052631578945, + "grad_norm": 0.03355651721358299, + "learning_rate": 1e-06, + "loss": 0.1963, + "step": 1099 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.32841236889362335, + "epoch": 2.8947368421052633, + "grad_norm": 6.091196537017822, + "learning_rate": 1e-06, + "loss": 0.2498, + "step": 1100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16313.0, + "completions/mean_length": 2224.619140625, + "completions/mean_terminated_length": 589.657958984375, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.3271956741809845, + "epoch": 2.8973684210526316, + "frac_reward_zero_std": 0.25, + "grad_norm": 43.073707580566406, + "learning_rate": 1e-06, + "loss": 0.1519, + "num_tokens": 352894899.0, + "reward": 0.7742513418197632, + "reward_std": 0.20133471488952637, + "rewards/progression_diversity/mean": -0.05045757442712784, + "rewards/progression_diversity/std": 0.14818480610847473, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.89697265625, + "rewards/symbolic_reward_partial_score/std": 0.2875053882598877, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1496270895004272, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 246.0, + "sampling/sampling_logp_difference/mean": 1.6051921844482422, + "step": 1101 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3244181126356125, + "epoch": 2.9, + "grad_norm": 0.012723026797175407, + "learning_rate": 1e-06, + "loss": 0.2311, + "step": 1102 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.33277395367622375, + "epoch": 2.9026315789473687, + "grad_norm": 2.2688472270965576, + "learning_rate": 1e-06, + "loss": 0.2433, + "step": 1103 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.33416812121868134, + "epoch": 2.905263157894737, + "grad_norm": 5.079883098602295, + "learning_rate": 1e-06, + "loss": 0.1993, + "step": 1104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.080078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12811.0, + "completions/mean_length": 1826.296875, + "completions/mean_terminated_length": 559.0658569335938, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.3188180774450302, + "epoch": 2.9078947368421053, + "frac_reward_zero_std": 0.21875, + "grad_norm": 23.28694725036621, + "learning_rate": 1e-06, + "loss": 0.1536, + "num_tokens": 354244779.0, + "reward": 0.7635840177536011, + "reward_std": 0.22885069251060486, + "rewards/progression_diversity/mean": -0.038090769201517105, + "rewards/progression_diversity/std": 0.12966662645339966, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.9059244394302368, + "rewards/symbolic_reward_partial_score/std": 0.2578800618648529, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1475815773010254, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 248.0, + "sampling/sampling_logp_difference/mean": 1.869348168373108, + "step": 1105 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3273351788520813, + "epoch": 2.9105263157894736, + "grad_norm": 4.7783918380737305, + "learning_rate": 1e-06, + "loss": 0.1048, + "step": 1106 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3305172771215439, + "epoch": 2.913157894736842, + "grad_norm": 0.007633809465914965, + "learning_rate": 1e-06, + "loss": 0.2638, + "step": 1107 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.333538293838501, + "epoch": 2.9157894736842103, + "grad_norm": 0.015457144938409328, + "learning_rate": 1e-06, + "loss": 0.2158, + "step": 1108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16183.0, + "completions/mean_length": 2176.537109375, + "completions/mean_terminated_length": 604.7787475585938, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.3226548731327057, + "epoch": 2.918421052631579, + "frac_reward_zero_std": 0.15625, + "grad_norm": 108.32244873046875, + "learning_rate": 1e-06, + "loss": 0.1848, + "num_tokens": 355747614.0, + "reward": 0.7507563233375549, + "reward_std": 0.25011780858039856, + "rewards/progression_diversity/mean": -0.04643935710191727, + "rewards/progression_diversity/std": 0.1406102478504181, + "rewards/symbolic_reward_accuracy/mean": 0.826171875, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.8816731572151184, + "rewards/symbolic_reward_partial_score/std": 0.2946558892726898, + "rewards/tag_count_reward/mean": -0.08984375, + "rewards/tag_count_reward/std": 0.2862374484539032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1501786708831787, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 252.0, + "sampling/sampling_logp_difference/mean": 2.1973791122436523, + "step": 1109 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.32274968922138214, + "epoch": 2.9210526315789473, + "grad_norm": 0.011330176144838333, + "learning_rate": 1e-06, + "loss": 0.3078, + "step": 1110 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.31783410906791687, + "epoch": 2.9236842105263157, + "grad_norm": 0.008597995154559612, + "learning_rate": 1e-06, + "loss": 0.1984, + "step": 1111 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3165570944547653, + "epoch": 2.9263157894736844, + "grad_norm": 0.3951316773891449, + "learning_rate": 1e-06, + "loss": 0.2277, + "step": 1112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11247.0, + "completions/mean_length": 2216.048828125, + "completions/mean_terminated_length": 614.454345703125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.3109191954135895, + "epoch": 2.9289473684210527, + "frac_reward_zero_std": 0.15625, + "grad_norm": 183.47438049316406, + "learning_rate": 1e-06, + "loss": 0.3146, + "num_tokens": 357277879.0, + "reward": 0.769425630569458, + "reward_std": 0.23327568173408508, + "rewards/progression_diversity/mean": -0.04474321007728577, + "rewards/progression_diversity/std": 0.13221436738967896, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.8937174081802368, + "rewards/symbolic_reward_partial_score/std": 0.2835972309112549, + "rewards/tag_count_reward/mean": -0.103515625, + "rewards/tag_count_reward/std": 0.30492907762527466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1350001096725464, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 254.0, + "sampling/sampling_logp_difference/mean": 2.6123390197753906, + "step": 1113 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.32664044201374054, + "epoch": 2.931578947368421, + "grad_norm": 84.57691192626953, + "learning_rate": 1e-06, + "loss": 0.2191, + "step": 1114 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.31761452555656433, + "epoch": 2.9342105263157894, + "grad_norm": 0.06620530039072037, + "learning_rate": 1e-06, + "loss": 0.1672, + "step": 1115 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3253130316734314, + "epoch": 2.9368421052631577, + "grad_norm": 0.013269034214317799, + "learning_rate": 1e-06, + "loss": 0.1481, + "step": 1116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.14453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9632.0, + "completions/mean_length": 2862.21875, + "completions/mean_terminated_length": 577.7168579101562, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.33379143476486206, + "epoch": 2.9394736842105265, + "frac_reward_zero_std": 0.0625, + "grad_norm": 188.9947052001953, + "learning_rate": 1e-06, + "loss": 0.2634, + "num_tokens": 359162663.0, + "reward": 0.7037363648414612, + "reward_std": 0.26258164644241333, + "rewards/progression_diversity/mean": -0.05605129897594452, + "rewards/progression_diversity/std": 0.14197811484336853, + "rewards/symbolic_reward_accuracy/mean": 0.76953125, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.8522135019302368, + "rewards/symbolic_reward_partial_score/std": 0.3169857859611511, + "rewards/tag_count_reward/mean": -0.130859375, + "rewards/tag_count_reward/std": 0.33757632970809937, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1356170177459717, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 258.0, + "sampling/sampling_logp_difference/mean": 3.2635445594787598, + "step": 1117 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.323417529463768, + "epoch": 2.942105263157895, + "grad_norm": 13.859256744384766, + "learning_rate": 1e-06, + "loss": 0.2917, + "step": 1118 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3256514072418213, + "epoch": 2.944736842105263, + "grad_norm": 0.9161269664764404, + "learning_rate": 1e-06, + "loss": 0.2028, + "step": 1119 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.3238846957683563, + "epoch": 2.9473684210526314, + "grad_norm": 2.7128119468688965, + "learning_rate": 1e-06, + "loss": 0.1348, + "step": 1120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16291.0, + "completions/mean_length": 2118.107421875, + "completions/mean_terminated_length": 642.325439453125, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.3269234597682953, + "epoch": 2.95, + "frac_reward_zero_std": 0.15625, + "grad_norm": 149.16152954101562, + "learning_rate": 1e-06, + "loss": 0.1463, + "num_tokens": 360662974.0, + "reward": 0.7428156137466431, + "reward_std": 0.2521783113479614, + "rewards/progression_diversity/mean": -0.039731502532958984, + "rewards/progression_diversity/std": 0.12264791131019592, + "rewards/symbolic_reward_accuracy/mean": 0.810546875, + "rewards/symbolic_reward_accuracy/std": 0.3922513723373413, + "rewards/symbolic_reward_partial_score/mean": 0.8855794072151184, + "rewards/symbolic_reward_partial_score/std": 0.2812291979789734, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1260616779327393, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 260.0, + "sampling/sampling_logp_difference/mean": 2.9290082454681396, + "step": 1121 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3308548033237457, + "epoch": 2.9526315789473685, + "grad_norm": 105.0246353149414, + "learning_rate": 1e-06, + "loss": 0.2037, + "step": 1122 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.3392486423254013, + "epoch": 2.955263157894737, + "grad_norm": 1.2266844511032104, + "learning_rate": 1e-06, + "loss": 0.2545, + "step": 1123 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.33492106199264526, + "epoch": 2.957894736842105, + "grad_norm": 0.017569491639733315, + "learning_rate": 1e-06, + "loss": 0.2375, + "step": 1124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16326.0, + "completions/mean_length": 2407.78125, + "completions/mean_terminated_length": 656.914306640625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.3292433023452759, + "epoch": 2.9605263157894735, + "frac_reward_zero_std": 0.3125, + "grad_norm": 124.67893981933594, + "learning_rate": 1e-06, + "loss": 0.1289, + "num_tokens": 362307246.0, + "reward": 0.7468899488449097, + "reward_std": 0.2119377851486206, + "rewards/progression_diversity/mean": -0.047334060072898865, + "rewards/progression_diversity/std": 0.13447138667106628, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.8844400644302368, + "rewards/symbolic_reward_partial_score/std": 0.29540929198265076, + "rewards/tag_count_reward/mean": -0.11328125, + "rewards/tag_count_reward/std": 0.3172462284564972, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.115929365158081, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 261.95855712890625, + "sampling/sampling_logp_difference/mean": 2.85267972946167, + "step": 1125 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.33551347255706787, + "epoch": 2.963157894736842, + "grad_norm": 46.468929290771484, + "learning_rate": 1e-06, + "loss": 0.2668, + "step": 1126 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3421187102794647, + "epoch": 2.9657894736842105, + "grad_norm": 0.010265377350151539, + "learning_rate": 1e-06, + "loss": 0.1918, + "step": 1127 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.33866679668426514, + "epoch": 2.968421052631579, + "grad_norm": 0.012180095538496971, + "learning_rate": 1e-06, + "loss": 0.2088, + "step": 1128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.115234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12667.0, + "completions/mean_length": 2409.67578125, + "completions/mean_terminated_length": 589.6203002929688, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.32998014986515045, + "epoch": 2.9710526315789476, + "frac_reward_zero_std": 0.1875, + "grad_norm": 134.6818084716797, + "learning_rate": 1e-06, + "loss": 0.2747, + "num_tokens": 363944808.0, + "reward": 0.7039549350738525, + "reward_std": 0.259724497795105, + "rewards/progression_diversity/mean": -0.04884590953588486, + "rewards/progression_diversity/std": 0.13787634670734406, + "rewards/symbolic_reward_accuracy/mean": 0.76953125, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.8474935293197632, + "rewards/symbolic_reward_partial_score/std": 0.3241187334060669, + "rewards/tag_count_reward/mean": -0.115234375, + "rewards/tag_count_reward/std": 0.3196168541908264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1256601810455322, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 262.0, + "sampling/sampling_logp_difference/mean": 3.3490161895751953, + "step": 1129 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3265266716480255, + "epoch": 2.973684210526316, + "grad_norm": 0.2271856814622879, + "learning_rate": 1e-06, + "loss": 0.2083, + "step": 1130 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3257727771997452, + "epoch": 2.9763157894736842, + "grad_norm": 0.01293912809342146, + "learning_rate": 1e-06, + "loss": 0.2484, + "step": 1131 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.32893073558807373, + "epoch": 2.9789473684210526, + "grad_norm": 0.01243510190397501, + "learning_rate": 1e-06, + "loss": 0.1611, + "step": 1132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16267.0, + "completions/mean_length": 2278.9140625, + "completions/mean_terminated_length": 684.426025390625, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.3159499317407608, + "epoch": 2.981578947368421, + "frac_reward_zero_std": 0.03125, + "grad_norm": 262.58709716796875, + "learning_rate": 1e-06, + "loss": 0.2098, + "num_tokens": 365549628.0, + "reward": 0.6847292184829712, + "reward_std": 0.29013562202453613, + "rewards/progression_diversity/mean": -0.042704228311777115, + "rewards/progression_diversity/std": 0.12792158126831055, + "rewards/symbolic_reward_accuracy/mean": 0.736328125, + "rewards/symbolic_reward_accuracy/std": 0.4410543739795685, + "rewards/symbolic_reward_partial_score/mean": 0.8444010019302368, + "rewards/symbolic_reward_partial_score/std": 0.31147703528404236, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1283180713653564, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 264.0, + "sampling/sampling_logp_difference/mean": 3.4847636222839355, + "step": 1133 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.3305753022432327, + "epoch": 2.984210526315789, + "grad_norm": 2.827713966369629, + "learning_rate": 1e-06, + "loss": 0.2373, + "step": 1134 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.3246810734272003, + "epoch": 2.986842105263158, + "grad_norm": 30.311552047729492, + "learning_rate": 1e-06, + "loss": 0.199, + "step": 1135 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.32095228135585785, + "epoch": 2.9894736842105263, + "grad_norm": 0.029631303623318672, + "learning_rate": 1e-06, + "loss": 0.2114, + "step": 1136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16215.0, + "completions/mean_length": 1764.580078125, + "completions/mean_terminated_length": 658.90966796875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.3265077769756317, + "epoch": 2.9921052631578946, + "frac_reward_zero_std": 0.4375, + "grad_norm": 68.62413787841797, + "learning_rate": 1e-06, + "loss": 0.136, + "num_tokens": 366860069.0, + "reward": 0.7820947170257568, + "reward_std": 0.12925215065479279, + "rewards/progression_diversity/mean": -0.02784038335084915, + "rewards/progression_diversity/std": 0.09940054267644882, + "rewards/symbolic_reward_accuracy/mean": 0.857421875, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.91845703125, + "rewards/symbolic_reward_partial_score/std": 0.24726058542728424, + "rewards/tag_count_reward/mean": -0.076171875, + "rewards/tag_count_reward/std": 0.26553234457969666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.105454683303833, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 264.0, + "sampling/sampling_logp_difference/mean": 2.1433823108673096, + "step": 1137 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.33374376595020294, + "epoch": 2.9947368421052634, + "grad_norm": 35.08319854736328, + "learning_rate": 1e-06, + "loss": 0.202, + "step": 1138 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.32793696224689484, + "epoch": 2.9973684210526317, + "grad_norm": 5.97829532623291, + "learning_rate": 1e-06, + "loss": 0.1058, + "step": 1139 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3303612768650055, + "epoch": 3.0, + "grad_norm": 61.562416076660156, + "learning_rate": 1e-06, + "loss": 0.1287, + "step": 1140 + }, + { + "epoch": 3.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.04638671875, + "eval_completions/max_length": 16384.0, + "eval_completions/max_terminated_length": 5637.46875, + "eval_completions/mean_length": 1238.202880859375, + "eval_completions/mean_terminated_length": 502.11154556274414, + "eval_completions/min_length": 207.375, + "eval_completions/min_terminated_length": 207.375, + "eval_entropy": 0.33187594451010227, + "eval_frac_reward_zero_std": 0.28515625, + "eval_loss": 0.05398595333099365, + "eval_num_tokens": 366860069.0, + "eval_reward": 0.7990773729979992, + "eval_reward_std": 0.21094764932058752, + "eval_rewards/progression_diversity/mean": -0.020244480154360645, + "eval_rewards/progression_diversity/std": 0.08825050009181723, + "eval_rewards/symbolic_reward_accuracy/mean": 0.87890625, + "eval_rewards/symbolic_reward_accuracy/std": 0.3186751971952617, + "eval_rewards/symbolic_reward_partial_score/mean": 0.9210205078125, + "eval_rewards/symbolic_reward_partial_score/std": 0.22956605115905404, + "eval_rewards/tag_count_reward/mean": -0.043701171875, + "eval_rewards/tag_count_reward/std": 0.19665716541931033, + "eval_runtime": 4075.6424, + "eval_samples_per_second": 0.061, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.084825448691845, + "eval_sampling/importance_sampling_ratio/min": 0.0, + "eval_sampling/sampling_logp_difference/max": 265.8116159439087, + "eval_sampling/sampling_logp_difference/mean": 1.002108539454639, + "eval_steps_per_second": 0.0, + "step": 1140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2218.0, + "completions/mean_length": 2117.115234375, + "completions/mean_terminated_length": 538.7830810546875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.3300452083349228, + "epoch": 3.0026315789473683, + "frac_reward_zero_std": 0.3125, + "grad_norm": 52.12168884277344, + "learning_rate": 1e-06, + "loss": 0.0937, + "num_tokens": 368347840.0, + "reward": 0.7367914915084839, + "reward_std": 0.1933838427066803, + "rewards/progression_diversity/mean": -0.03667115420103073, + "rewards/progression_diversity/std": 0.11381202191114426, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.87451171875, + "rewards/symbolic_reward_partial_score/std": 0.3029172122478485, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1042859554290771, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 266.0, + "sampling/sampling_logp_difference/mean": 2.27083158493042, + "step": 1141 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.32495447993278503, + "epoch": 3.0052631578947366, + "grad_norm": 1.0844265222549438, + "learning_rate": 1e-06, + "loss": 0.1562, + "step": 1142 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.31977418065071106, + "epoch": 3.0078947368421054, + "grad_norm": 32.22896194458008, + "learning_rate": 1e-06, + "loss": 0.0852, + "step": 1143 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.33107973635196686, + "epoch": 3.0105263157894737, + "grad_norm": 0.01271702442318201, + "learning_rate": 1e-06, + "loss": 0.183, + "step": 1144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2963.0, + "completions/mean_length": 1620.275390625, + "completions/mean_terminated_length": 536.9832153320312, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.3337739109992981, + "epoch": 3.013157894736842, + "frac_reward_zero_std": 0.28125, + "grad_norm": 256.0187683105469, + "learning_rate": 1e-06, + "loss": 0.1703, + "num_tokens": 369584397.0, + "reward": 0.7931051254272461, + "reward_std": 0.1887580156326294, + "rewards/progression_diversity/mean": -0.02543444186449051, + "rewards/progression_diversity/std": 0.09666049480438232, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.923828125, + "rewards/symbolic_reward_partial_score/std": 0.23379817605018616, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.108473300933838, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 268.0, + "sampling/sampling_logp_difference/mean": 2.444417953491211, + "step": 1145 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.33417925238609314, + "epoch": 3.0157894736842104, + "grad_norm": 152.89532470703125, + "learning_rate": 1e-06, + "loss": 0.2239, + "step": 1146 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3302992135286331, + "epoch": 3.018421052631579, + "grad_norm": 2.4071531295776367, + "learning_rate": 1e-06, + "loss": 0.1218, + "step": 1147 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.32197025418281555, + "epoch": 3.0210526315789474, + "grad_norm": 0.013167927972972393, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 1148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13552.0, + "completions/mean_length": 1762.455078125, + "completions/mean_terminated_length": 590.263671875, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "entropy": 0.31848709285259247, + "epoch": 3.0236842105263158, + "frac_reward_zero_std": 0.25, + "grad_norm": 167.57388305664062, + "learning_rate": 1e-06, + "loss": 0.0925, + "num_tokens": 370898422.0, + "reward": 0.775731086730957, + "reward_std": 0.21161219477653503, + "rewards/progression_diversity/mean": -0.029432490468025208, + "rewards/progression_diversity/std": 0.1058623269200325, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.8986002802848816, + "rewards/symbolic_reward_partial_score/std": 0.2755959630012512, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1008307933807373, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 268.0, + "sampling/sampling_logp_difference/mean": 2.5271763801574707, + "step": 1149 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.32427777349948883, + "epoch": 3.026315789473684, + "grad_norm": 0.09036588668823242, + "learning_rate": 1e-06, + "loss": 0.1532, + "step": 1150 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.32039374113082886, + "epoch": 3.028947368421053, + "grad_norm": 0.03532138466835022, + "learning_rate": 1e-06, + "loss": 0.1982, + "step": 1151 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3186514228582382, + "epoch": 3.031578947368421, + "grad_norm": 0.011541608721017838, + "learning_rate": 1e-06, + "loss": 0.1511, + "step": 1152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9474.0, + "completions/mean_length": 1170.345703125, + "completions/mean_terminated_length": 551.9044799804688, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.3213903158903122, + "epoch": 3.0342105263157895, + "frac_reward_zero_std": 0.4375, + "grad_norm": 25.063764572143555, + "learning_rate": 1e-06, + "loss": 0.0644, + "num_tokens": 371893767.0, + "reward": 0.8350409269332886, + "reward_std": 0.14898480474948883, + "rewards/progression_diversity/mean": -0.016419440507888794, + "rewards/progression_diversity/std": 0.08188439160585403, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9454752802848816, + "rewards/symbolic_reward_partial_score/std": 0.20989501476287842, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.088158369064331, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 268.0, + "sampling/sampling_logp_difference/mean": 1.5961803197860718, + "step": 1153 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3240705579519272, + "epoch": 3.036842105263158, + "grad_norm": 0.00562731409445405, + "learning_rate": 1e-06, + "loss": 0.1014, + "step": 1154 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.32319276034832, + "epoch": 3.039473684210526, + "grad_norm": 0.006894012447446585, + "learning_rate": 1e-06, + "loss": 0.097, + "step": 1155 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3254219591617584, + "epoch": 3.042105263157895, + "grad_norm": 0.0085447384044528, + "learning_rate": 1e-06, + "loss": 0.0831, + "step": 1156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14955.0, + "completions/mean_length": 1684.576171875, + "completions/mean_terminated_length": 606.0020751953125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.3363809883594513, + "epoch": 3.044736842105263, + "frac_reward_zero_std": 0.28125, + "grad_norm": 185.30889892578125, + "learning_rate": 1e-06, + "loss": 0.1307, + "num_tokens": 373167598.0, + "reward": 0.7592610120773315, + "reward_std": 0.2298196256160736, + "rewards/progression_diversity/mean": -0.026055842638015747, + "rewards/progression_diversity/std": 0.09842479228973389, + "rewards/symbolic_reward_accuracy/mean": 0.830078125, + "rewards/symbolic_reward_accuracy/std": 0.3759314715862274, + "rewards/symbolic_reward_partial_score/mean": 0.89306640625, + "rewards/symbolic_reward_partial_score/std": 0.2722471356391907, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0948313474655151, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 270.0, + "sampling/sampling_logp_difference/mean": 2.451362133026123, + "step": 1157 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.32949453592300415, + "epoch": 3.0473684210526315, + "grad_norm": 0.15845933556556702, + "learning_rate": 1e-06, + "loss": 0.15, + "step": 1158 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3209853768348694, + "epoch": 3.05, + "grad_norm": 0.012255529873073101, + "learning_rate": 1e-06, + "loss": 0.1257, + "step": 1159 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3235623836517334, + "epoch": 3.0526315789473686, + "grad_norm": 0.0065957182087004185, + "learning_rate": 1e-06, + "loss": 0.1606, + "step": 1160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10149.0, + "completions/mean_length": 1787.939453125, + "completions/mean_terminated_length": 584.458740234375, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.3329770565032959, + "epoch": 3.055263157894737, + "frac_reward_zero_std": 0.3125, + "grad_norm": 219.4588165283203, + "learning_rate": 1e-06, + "loss": 0.1499, + "num_tokens": 374477807.0, + "reward": 0.7548931837081909, + "reward_std": 0.1553860455751419, + "rewards/progression_diversity/mean": -0.02826322615146637, + "rewards/progression_diversity/std": 0.10057384520769119, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.8968098759651184, + "rewards/symbolic_reward_partial_score/std": 0.26359865069389343, + "rewards/tag_count_reward/mean": -0.072265625, + "rewards/tag_count_reward/std": 0.2591804563999176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0940297842025757, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 272.0, + "sampling/sampling_logp_difference/mean": 2.350238084793091, + "step": 1161 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.3281650245189667, + "epoch": 3.057894736842105, + "grad_norm": 0.007984149269759655, + "learning_rate": 1e-06, + "loss": 0.2052, + "step": 1162 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3284861296415329, + "epoch": 3.0605263157894735, + "grad_norm": 0.035218384116888046, + "learning_rate": 1e-06, + "loss": 0.12, + "step": 1163 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3274737000465393, + "epoch": 3.0631578947368423, + "grad_norm": 0.012610013596713543, + "learning_rate": 1e-06, + "loss": 0.1512, + "step": 1164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1523.06640625, + "completions/mean_terminated_length": 532.3375244140625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "entropy": 0.3336295783519745, + "epoch": 3.0657894736842106, + "frac_reward_zero_std": 0.4375, + "grad_norm": 120.76913452148438, + "learning_rate": 1e-06, + "loss": 0.1477, + "num_tokens": 375667281.0, + "reward": 0.8087124824523926, + "reward_std": 0.15946154296398163, + "rewards/progression_diversity/mean": -0.022307991981506348, + "rewards/progression_diversity/std": 0.08901267498731613, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.93212890625, + "rewards/symbolic_reward_partial_score/std": 0.231436625123024, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0914645195007324, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 272.0, + "sampling/sampling_logp_difference/mean": 1.7476840019226074, + "step": 1165 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.32854387164115906, + "epoch": 3.068421052631579, + "grad_norm": 0.015112695284187794, + "learning_rate": 1e-06, + "loss": 0.1242, + "step": 1166 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3354027718305588, + "epoch": 3.0710526315789473, + "grad_norm": 0.6051515340805054, + "learning_rate": 1e-06, + "loss": 0.0242, + "step": 1167 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3323533236980438, + "epoch": 3.0736842105263156, + "grad_norm": 0.01162874884903431, + "learning_rate": 1e-06, + "loss": 0.2038, + "step": 1168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2259.0, + "completions/mean_length": 1519.060546875, + "completions/mean_terminated_length": 528.0646362304688, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.34026047587394714, + "epoch": 3.0763157894736843, + "frac_reward_zero_std": 0.34375, + "grad_norm": 21.67325210571289, + "learning_rate": 1e-06, + "loss": 0.0595, + "num_tokens": 376820816.0, + "reward": 0.8038376569747925, + "reward_std": 0.16369682550430298, + "rewards/progression_diversity/mean": -0.021513454616069794, + "rewards/progression_diversity/std": 0.08785874396562576, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9295247197151184, + "rewards/symbolic_reward_partial_score/std": 0.22601304948329926, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0951886177062988, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 272.0, + "sampling/sampling_logp_difference/mean": 1.9224200248718262, + "step": 1169 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.33099737763404846, + "epoch": 3.0789473684210527, + "grad_norm": 0.010425153188407421, + "learning_rate": 1e-06, + "loss": 0.1494, + "step": 1170 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3235698342323303, + "epoch": 3.081578947368421, + "grad_norm": 0.013261355459690094, + "learning_rate": 1e-06, + "loss": 0.1044, + "step": 1171 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3331771790981293, + "epoch": 3.0842105263157893, + "grad_norm": 0.008659729734063148, + "learning_rate": 1e-06, + "loss": 0.2015, + "step": 1172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15931.0, + "completions/mean_length": 1108.689453125, + "completions/mean_terminated_length": 552.0991821289062, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.33606918156147003, + "epoch": 3.086842105263158, + "frac_reward_zero_std": 0.46875, + "grad_norm": 340.97711181640625, + "learning_rate": 1e-06, + "loss": 0.1414, + "num_tokens": 377781745.0, + "reward": 0.8523913025856018, + "reward_std": 0.13625648617744446, + "rewards/progression_diversity/mean": -0.014778539538383484, + "rewards/progression_diversity/std": 0.07598426192998886, + "rewards/symbolic_reward_accuracy/mean": 0.947265625, + "rewards/symbolic_reward_accuracy/std": 0.22372129559516907, + "rewards/symbolic_reward_partial_score/mean": 0.9596354365348816, + "rewards/symbolic_reward_partial_score/std": 0.185786634683609, + "rewards/tag_count_reward/mean": -0.037109375, + "rewards/tag_count_reward/std": 0.18921469151973724, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0950132608413696, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 274.0, + "sampling/sampling_logp_difference/mean": 1.4215747117996216, + "step": 1173 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3379185199737549, + "epoch": 3.0894736842105264, + "grad_norm": 0.018991535529494286, + "learning_rate": 1e-06, + "loss": 0.1565, + "step": 1174 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.33449527621269226, + "epoch": 3.0921052631578947, + "grad_norm": 0.005514142569154501, + "learning_rate": 1e-06, + "loss": 0.062, + "step": 1175 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.32939158380031586, + "epoch": 3.094736842105263, + "grad_norm": 0.005196165293455124, + "learning_rate": 1e-06, + "loss": 0.0599, + "step": 1176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15172.0, + "completions/mean_length": 1775.599609375, + "completions/mean_terminated_length": 604.4619750976562, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.3369821459054947, + "epoch": 3.0973684210526318, + "frac_reward_zero_std": 0.3125, + "grad_norm": 145.88417053222656, + "learning_rate": 1e-06, + "loss": 0.1591, + "num_tokens": 379096324.0, + "reward": 0.7732741236686707, + "reward_std": 0.21388080716133118, + "rewards/progression_diversity/mean": -0.026106324046850204, + "rewards/progression_diversity/std": 0.09331251680850983, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.9078775644302368, + "rewards/symbolic_reward_partial_score/std": 0.2603663206100464, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.103142499923706, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 276.0, + "sampling/sampling_logp_difference/mean": 2.0799942016601562, + "step": 1177 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.34052442014217377, + "epoch": 3.1, + "grad_norm": 0.011826397851109505, + "learning_rate": 1e-06, + "loss": 0.1395, + "step": 1178 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3255487084388733, + "epoch": 3.1026315789473684, + "grad_norm": 0.008594873361289501, + "learning_rate": 1e-06, + "loss": 0.1081, + "step": 1179 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3336612284183502, + "epoch": 3.1052631578947367, + "grad_norm": 0.01105382852256298, + "learning_rate": 1e-06, + "loss": 0.1773, + "step": 1180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2873.0, + "completions/mean_length": 1904.052734375, + "completions/mean_terminated_length": 576.4669799804688, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.3368203490972519, + "epoch": 3.1078947368421055, + "frac_reward_zero_std": 0.28125, + "grad_norm": 61.46799087524414, + "learning_rate": 1e-06, + "loss": 0.0739, + "num_tokens": 380476511.0, + "reward": 0.7395263910293579, + "reward_std": 0.2011542022228241, + "rewards/progression_diversity/mean": -0.026853924617171288, + "rewards/progression_diversity/std": 0.09222765266895294, + "rewards/symbolic_reward_accuracy/mean": 0.802734375, + "rewards/symbolic_reward_accuracy/std": 0.3983237147331238, + "rewards/symbolic_reward_partial_score/mean": 0.88525390625, + "rewards/symbolic_reward_partial_score/std": 0.2737979590892792, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0973585844039917, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 276.0, + "sampling/sampling_logp_difference/mean": 1.8168814182281494, + "step": 1181 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3273409307003021, + "epoch": 3.110526315789474, + "grad_norm": 0.014104950241744518, + "learning_rate": 1e-06, + "loss": 0.1636, + "step": 1182 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.315226212143898, + "epoch": 3.113157894736842, + "grad_norm": 0.3530890643596649, + "learning_rate": 1e-06, + "loss": 0.1942, + "step": 1183 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3299044966697693, + "epoch": 3.1157894736842104, + "grad_norm": 0.017564065754413605, + "learning_rate": 1e-06, + "loss": 0.1096, + "step": 1184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10168.0, + "completions/mean_length": 1682.30859375, + "completions/mean_terminated_length": 603.568115234375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.32700584828853607, + "epoch": 3.1184210526315788, + "frac_reward_zero_std": 0.34375, + "grad_norm": 11.54803466796875, + "learning_rate": 1e-06, + "loss": 0.0785, + "num_tokens": 381744989.0, + "reward": 0.7641712427139282, + "reward_std": 0.18420130014419556, + "rewards/progression_diversity/mean": -0.023311495780944824, + "rewards/progression_diversity/std": 0.08777539432048798, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.90478515625, + "rewards/symbolic_reward_partial_score/std": 0.2593802213668823, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0980756282806396, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 278.0, + "sampling/sampling_logp_difference/mean": 1.7065038681030273, + "step": 1185 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3249793201684952, + "epoch": 3.1210526315789475, + "grad_norm": 242.987060546875, + "learning_rate": 1e-06, + "loss": 0.157, + "step": 1186 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3353985846042633, + "epoch": 3.123684210526316, + "grad_norm": 0.011672238819301128, + "learning_rate": 1e-06, + "loss": 0.164, + "step": 1187 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3257022500038147, + "epoch": 3.126315789473684, + "grad_norm": 0.03746514022350311, + "learning_rate": 1e-06, + "loss": 0.1344, + "step": 1188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2665.0, + "completions/mean_length": 2012.21484375, + "completions/mean_terminated_length": 593.540771484375, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.326521098613739, + "epoch": 3.1289473684210525, + "frac_reward_zero_std": 0.28125, + "grad_norm": 233.8143310546875, + "learning_rate": 1e-06, + "loss": 0.1006, + "num_tokens": 383184043.0, + "reward": 0.7504065632820129, + "reward_std": 0.20918390154838562, + "rewards/progression_diversity/mean": -0.027706898748874664, + "rewards/progression_diversity/std": 0.09274362027645111, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.8850911855697632, + "rewards/symbolic_reward_partial_score/std": 0.289438933134079, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1031694412231445, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 278.0, + "sampling/sampling_logp_difference/mean": 1.8653408288955688, + "step": 1189 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.32067760825157166, + "epoch": 3.1315789473684212, + "grad_norm": 49.3584098815918, + "learning_rate": 1e-06, + "loss": 0.1867, + "step": 1190 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3191855549812317, + "epoch": 3.1342105263157896, + "grad_norm": 0.021937908604741096, + "learning_rate": 1e-06, + "loss": 0.2106, + "step": 1191 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.30871887505054474, + "epoch": 3.136842105263158, + "grad_norm": 0.017672132700681686, + "learning_rate": 1e-06, + "loss": 0.1299, + "step": 1192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 7855.0, + "completions/mean_length": 2210.833984375, + "completions/mean_terminated_length": 608.6499633789062, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.3056802898645401, + "epoch": 3.139473684210526, + "frac_reward_zero_std": 0.1875, + "grad_norm": 79.18196868896484, + "learning_rate": 1e-06, + "loss": 0.2021, + "num_tokens": 384727958.0, + "reward": 0.754828691482544, + "reward_std": 0.21231617033481598, + "rewards/progression_diversity/mean": -0.03471089154481888, + "rewards/progression_diversity/std": 0.10704705864191055, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.8811848759651184, + "rewards/symbolic_reward_partial_score/std": 0.29755792021751404, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1085774898529053, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 280.0, + "sampling/sampling_logp_difference/mean": 2.3273110389709473, + "step": 1193 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3188657760620117, + "epoch": 3.1421052631578945, + "grad_norm": 0.007435488048940897, + "learning_rate": 1e-06, + "loss": 0.1936, + "step": 1194 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.31570450961589813, + "epoch": 3.1447368421052633, + "grad_norm": 0.71424400806427, + "learning_rate": 1e-06, + "loss": 0.1574, + "step": 1195 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.390625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.3103174865245819, + "epoch": 3.1473684210526316, + "grad_norm": 2.483917474746704, + "learning_rate": 1e-06, + "loss": 0.2062, + "step": 1196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11863.0, + "completions/mean_length": 2258.6640625, + "completions/mean_terminated_length": 593.2314453125, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "entropy": 0.3229822814464569, + "epoch": 3.15, + "frac_reward_zero_std": 0.25, + "grad_norm": 142.69839477539062, + "learning_rate": 1e-06, + "loss": 0.1636, + "num_tokens": 386288202.0, + "reward": 0.7562662363052368, + "reward_std": 0.2117644101381302, + "rewards/progression_diversity/mean": -0.032554611563682556, + "rewards/progression_diversity/std": 0.09795653074979782, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.8787435293197632, + "rewards/symbolic_reward_partial_score/std": 0.3041481375694275, + "rewards/tag_count_reward/mean": -0.09765625, + "rewards/tag_count_reward/std": 0.29713961482048035, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.106873869895935, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 280.0, + "sampling/sampling_logp_difference/mean": 1.8862922191619873, + "step": 1197 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.31533288955688477, + "epoch": 3.1526315789473682, + "grad_norm": 63.04111099243164, + "learning_rate": 1e-06, + "loss": 0.1067, + "step": 1198 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3111482411623001, + "epoch": 3.155263157894737, + "grad_norm": 0.010430481284856796, + "learning_rate": 1e-06, + "loss": 0.2744, + "step": 1199 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.32699713110923767, + "epoch": 3.1578947368421053, + "grad_norm": 79.19801330566406, + "learning_rate": 1e-06, + "loss": 0.1564, + "step": 1200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2453.0, + "completions/mean_length": 2007.53125, + "completions/mean_terminated_length": 588.3948364257812, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.3115497827529907, + "epoch": 3.1605263157894736, + "frac_reward_zero_std": 0.28125, + "grad_norm": 73.29586029052734, + "learning_rate": 1e-06, + "loss": 0.1916, + "num_tokens": 387721850.0, + "reward": 0.7630906701087952, + "reward_std": 0.19893735647201538, + "rewards/progression_diversity/mean": -0.028826594352722168, + "rewards/progression_diversity/std": 0.09443072229623795, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.8981119394302368, + "rewards/symbolic_reward_partial_score/std": 0.2708629369735718, + "rewards/tag_count_reward/mean": -0.076171875, + "rewards/tag_count_reward/std": 0.26553234457969666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.108184576034546, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 282.0, + "sampling/sampling_logp_difference/mean": 2.00418758392334, + "step": 1201 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.32552970945835114, + "epoch": 3.163157894736842, + "grad_norm": 6.941614627838135, + "learning_rate": 1e-06, + "loss": 0.1422, + "step": 1202 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.32488586008548737, + "epoch": 3.1657894736842107, + "grad_norm": 0.01468179002404213, + "learning_rate": 1e-06, + "loss": 0.1705, + "step": 1203 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30805665254592896, + "epoch": 3.168421052631579, + "grad_norm": 0.013095368631184101, + "learning_rate": 1e-06, + "loss": 0.2168, + "step": 1204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9420.0, + "completions/mean_length": 1713.927734375, + "completions/mean_terminated_length": 571.206298828125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.3258505165576935, + "epoch": 3.1710526315789473, + "frac_reward_zero_std": 0.34375, + "grad_norm": 112.94711303710938, + "learning_rate": 1e-06, + "loss": 0.1194, + "num_tokens": 388982517.0, + "reward": 0.8024517297744751, + "reward_std": 0.18236470222473145, + "rewards/progression_diversity/mean": -0.023384764790534973, + "rewards/progression_diversity/std": 0.08616573363542557, + "rewards/symbolic_reward_accuracy/mean": 0.888671875, + "rewards/symbolic_reward_accuracy/std": 0.31484565138816833, + "rewards/symbolic_reward_partial_score/mean": 0.9152017831802368, + "rewards/symbolic_reward_partial_score/std": 0.25712403655052185, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1038358211517334, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 284.0, + "sampling/sampling_logp_difference/mean": 1.693922996520996, + "step": 1205 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3253411054611206, + "epoch": 3.1736842105263157, + "grad_norm": 4.749903202056885, + "learning_rate": 1e-06, + "loss": 0.229, + "step": 1206 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.32736538350582123, + "epoch": 3.1763157894736844, + "grad_norm": 0.008272160775959492, + "learning_rate": 1e-06, + "loss": 0.1404, + "step": 1207 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.32286109030246735, + "epoch": 3.1789473684210527, + "grad_norm": 0.004888339899480343, + "learning_rate": 1e-06, + "loss": 0.1054, + "step": 1208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15292.0, + "completions/mean_length": 1806.24609375, + "completions/mean_terminated_length": 570.8432006835938, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.3235931098461151, + "epoch": 3.181578947368421, + "frac_reward_zero_std": 0.40625, + "grad_norm": 25.39488983154297, + "learning_rate": 1e-06, + "loss": 0.1452, + "num_tokens": 390291091.0, + "reward": 0.8004156351089478, + "reward_std": 0.1887449473142624, + "rewards/progression_diversity/mean": -0.026793645694851875, + "rewards/progression_diversity/std": 0.09325826168060303, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9176431894302368, + "rewards/symbolic_reward_partial_score/std": 0.25725844502449036, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1049944162368774, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 288.0, + "sampling/sampling_logp_difference/mean": 1.934594750404358, + "step": 1209 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3248671740293503, + "epoch": 3.1842105263157894, + "grad_norm": 0.007971227169036865, + "learning_rate": 1e-06, + "loss": 0.1074, + "step": 1210 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.311784565448761, + "epoch": 3.1868421052631577, + "grad_norm": 15.894944190979004, + "learning_rate": 1e-06, + "loss": 0.188, + "step": 1211 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3261888176202774, + "epoch": 3.1894736842105265, + "grad_norm": 0.019114594906568527, + "learning_rate": 1e-06, + "loss": 0.1449, + "step": 1212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9429.0, + "completions/mean_length": 1767.72265625, + "completions/mean_terminated_length": 629.1915283203125, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.3183141201734543, + "epoch": 3.192105263157895, + "frac_reward_zero_std": 0.3125, + "grad_norm": 201.4045867919922, + "learning_rate": 1e-06, + "loss": 0.1038, + "num_tokens": 391601637.0, + "reward": 0.793647050857544, + "reward_std": 0.20239751040935516, + "rewards/progression_diversity/mean": -0.02494898810982704, + "rewards/progression_diversity/std": 0.08966317027807236, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9158528447151184, + "rewards/symbolic_reward_partial_score/std": 0.2540964186191559, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1079578399658203, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 290.0, + "sampling/sampling_logp_difference/mean": 2.028384208679199, + "step": 1213 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.31385740637779236, + "epoch": 3.194736842105263, + "grad_norm": 0.025811778381466866, + "learning_rate": 1e-06, + "loss": 0.1903, + "step": 1214 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3017711192369461, + "epoch": 3.1973684210526314, + "grad_norm": 148.46176147460938, + "learning_rate": 1e-06, + "loss": 0.243, + "step": 1215 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3152967840433121, + "epoch": 3.2, + "grad_norm": 6.499451160430908, + "learning_rate": 1e-06, + "loss": 0.0559, + "step": 1216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12806.0, + "completions/mean_length": 2251.021484375, + "completions/mean_terminated_length": 653.3804321289062, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.30623379349708557, + "epoch": 3.2026315789473685, + "frac_reward_zero_std": 0.28125, + "grad_norm": 91.87493133544922, + "learning_rate": 1e-06, + "loss": 0.1956, + "num_tokens": 393178320.0, + "reward": 0.7417978048324585, + "reward_std": 0.22114922106266022, + "rewards/progression_diversity/mean": -0.029206208884716034, + "rewards/progression_diversity/std": 0.09235595166683197, + "rewards/symbolic_reward_accuracy/mean": 0.81640625, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.8688150644302368, + "rewards/symbolic_reward_partial_score/std": 0.30946817994117737, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1069762706756592, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 292.0, + "sampling/sampling_logp_difference/mean": 1.9247076511383057, + "step": 1217 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30731241405010223, + "epoch": 3.205263157894737, + "grad_norm": 6.715027809143066, + "learning_rate": 1e-06, + "loss": 0.1788, + "step": 1218 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2882115989923477, + "epoch": 3.207894736842105, + "grad_norm": 0.01562909595668316, + "learning_rate": 1e-06, + "loss": 0.2044, + "step": 1219 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.30942070484161377, + "epoch": 3.2105263157894735, + "grad_norm": 0.34386348724365234, + "learning_rate": 1e-06, + "loss": 0.1043, + "step": 1220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2114.0, + "completions/mean_length": 2317.568359375, + "completions/mean_terminated_length": 555.4000244140625, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.3095013201236725, + "epoch": 3.213157894736842, + "frac_reward_zero_std": 0.15625, + "grad_norm": 89.25138092041016, + "learning_rate": 1e-06, + "loss": 0.134, + "num_tokens": 394758899.0, + "reward": 0.7222451567649841, + "reward_std": 0.2374163269996643, + "rewards/progression_diversity/mean": -0.03622689098119736, + "rewards/progression_diversity/std": 0.10711545497179031, + "rewards/symbolic_reward_accuracy/mean": 0.7890625, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.86572265625, + "rewards/symbolic_reward_partial_score/std": 0.30833473801612854, + "rewards/tag_count_reward/mean": -0.10546875, + "rewards/tag_count_reward/std": 0.3074568510055542, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.108048915863037, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 292.0, + "sampling/sampling_logp_difference/mean": 2.0354158878326416, + "step": 1221 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.30622752010822296, + "epoch": 3.2157894736842105, + "grad_norm": 0.01329483650624752, + "learning_rate": 1e-06, + "loss": 0.2269, + "step": 1222 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2995607703924179, + "epoch": 3.218421052631579, + "grad_norm": 0.011663331650197506, + "learning_rate": 1e-06, + "loss": 0.1555, + "step": 1223 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.29136380553245544, + "epoch": 3.221052631578947, + "grad_norm": 0.02316325716674328, + "learning_rate": 1e-06, + "loss": 0.2399, + "step": 1224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14746.0, + "completions/mean_length": 1491.333984375, + "completions/mean_terminated_length": 629.7747802734375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.31069743633270264, + "epoch": 3.223684210526316, + "frac_reward_zero_std": 0.3125, + "grad_norm": 37.16641616821289, + "learning_rate": 1e-06, + "loss": 0.1773, + "num_tokens": 395935454.0, + "reward": 0.8113774061203003, + "reward_std": 0.20781219005584717, + "rewards/progression_diversity/mean": -0.019492559134960175, + "rewards/progression_diversity/std": 0.07982630282640457, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.9226887822151184, + "rewards/symbolic_reward_partial_score/std": 0.2527928054332733, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.096280813217163, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 292.0, + "sampling/sampling_logp_difference/mean": 1.8208186626434326, + "step": 1225 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.30570439994335175, + "epoch": 3.2263157894736842, + "grad_norm": 0.007866773754358292, + "learning_rate": 1e-06, + "loss": 0.215, + "step": 1226 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.32926367223262787, + "epoch": 3.2289473684210526, + "grad_norm": 0.04123647138476372, + "learning_rate": 1e-06, + "loss": 0.0619, + "step": 1227 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.32413050532341003, + "epoch": 3.231578947368421, + "grad_norm": 0.04225276783108711, + "learning_rate": 1e-06, + "loss": 0.1256, + "step": 1228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10278.0, + "completions/mean_length": 1581.517578125, + "completions/mean_terminated_length": 594.6854248046875, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.3033265024423599, + "epoch": 3.2342105263157896, + "frac_reward_zero_std": 0.34375, + "grad_norm": 168.14776611328125, + "learning_rate": 1e-06, + "loss": 0.2286, + "num_tokens": 397158023.0, + "reward": 0.8106723427772522, + "reward_std": 0.19477233290672302, + "rewards/progression_diversity/mean": -0.021638944745063782, + "rewards/progression_diversity/std": 0.08424000442028046, + "rewards/symbolic_reward_accuracy/mean": 0.900390625, + "rewards/symbolic_reward_accuracy/std": 0.29977133870124817, + "rewards/symbolic_reward_partial_score/mean": 0.9230143427848816, + "rewards/symbolic_reward_partial_score/std": 0.2554587125778198, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0959186553955078, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 292.0, + "sampling/sampling_logp_difference/mean": 1.717806100845337, + "step": 1229 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.32844409346580505, + "epoch": 3.236842105263158, + "grad_norm": 10.585503578186035, + "learning_rate": 1e-06, + "loss": 0.1134, + "step": 1230 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31832896173000336, + "epoch": 3.2394736842105263, + "grad_norm": 1.0383377075195312, + "learning_rate": 1e-06, + "loss": 0.1613, + "step": 1231 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.33217278122901917, + "epoch": 3.2421052631578946, + "grad_norm": 0.013041791506111622, + "learning_rate": 1e-06, + "loss": 0.0293, + "step": 1232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13950.0, + "completions/mean_length": 1890.353515625, + "completions/mean_terminated_length": 561.51171875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.30604319274425507, + "epoch": 3.2447368421052634, + "frac_reward_zero_std": 0.25, + "grad_norm": 29.466703414916992, + "learning_rate": 1e-06, + "loss": 0.1197, + "num_tokens": 398532860.0, + "reward": 0.7769619226455688, + "reward_std": 0.22913838922977448, + "rewards/progression_diversity/mean": -0.028422629460692406, + "rewards/progression_diversity/std": 0.09470929205417633, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.9046223759651184, + "rewards/symbolic_reward_partial_score/std": 0.27007627487182617, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.101804256439209, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 294.0, + "sampling/sampling_logp_difference/mean": 1.9203028678894043, + "step": 1233 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.31168776750564575, + "epoch": 3.2473684210526317, + "grad_norm": 59.52106857299805, + "learning_rate": 1e-06, + "loss": 0.2153, + "step": 1234 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3222711682319641, + "epoch": 3.25, + "grad_norm": 0.33637794852256775, + "learning_rate": 1e-06, + "loss": 0.0839, + "step": 1235 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.30836866796016693, + "epoch": 3.2526315789473683, + "grad_norm": 0.019555335864424706, + "learning_rate": 1e-06, + "loss": 0.1737, + "step": 1236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.169921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12113.0, + "completions/mean_length": 3351.146484375, + "completions/mean_terminated_length": 683.2446899414062, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.2953759431838989, + "epoch": 3.2552631578947366, + "frac_reward_zero_std": 0.21875, + "grad_norm": 87.26698303222656, + "learning_rate": 1e-06, + "loss": 0.1332, + "num_tokens": 400674471.0, + "reward": 0.6438003182411194, + "reward_std": 0.23994354903697968, + "rewards/progression_diversity/mean": -0.05356261506676674, + "rewards/progression_diversity/std": 0.12248878180980682, + "rewards/symbolic_reward_accuracy/mean": 0.703125, + "rewards/symbolic_reward_accuracy/std": 0.45732781291007996, + "rewards/symbolic_reward_partial_score/mean": 0.7936197519302368, + "rewards/symbolic_reward_partial_score/std": 0.36749520897865295, + "rewards/tag_count_reward/mean": -0.15625, + "rewards/tag_count_reward/std": 0.36344730854034424, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1080725193023682, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 294.0, + "sampling/sampling_logp_difference/mean": 2.1226110458374023, + "step": 1237 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.30956192314624786, + "epoch": 3.2578947368421054, + "grad_norm": 9.49582290649414, + "learning_rate": 1e-06, + "loss": 0.1708, + "step": 1238 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.21875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.27571606636047363, + "epoch": 3.2605263157894737, + "grad_norm": 0.008884378708899021, + "learning_rate": 1e-06, + "loss": 0.3496, + "step": 1239 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.27126601338386536, + "epoch": 3.263157894736842, + "grad_norm": 18.252225875854492, + "learning_rate": 1e-06, + "loss": 0.2307, + "step": 1240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1876.0, + "completions/mean_length": 1877.640625, + "completions/mean_terminated_length": 513.794921875, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.32085326313972473, + "epoch": 3.2657894736842104, + "frac_reward_zero_std": 0.28125, + "grad_norm": 87.52373504638672, + "learning_rate": 1e-06, + "loss": 0.0879, + "num_tokens": 402027599.0, + "reward": 0.7780733108520508, + "reward_std": 0.20871524512767792, + "rewards/progression_diversity/mean": -0.029581986367702484, + "rewards/progression_diversity/std": 0.09899447858333588, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.90185546875, + "rewards/symbolic_reward_partial_score/std": 0.2770686745643616, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0994126796722412, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 294.0, + "sampling/sampling_logp_difference/mean": 2.019242286682129, + "step": 1241 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.30657218396663666, + "epoch": 3.268421052631579, + "grad_norm": 0.007070607040077448, + "learning_rate": 1e-06, + "loss": 0.1977, + "step": 1242 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.30736126005649567, + "epoch": 3.2710526315789474, + "grad_norm": 0.016214022412896156, + "learning_rate": 1e-06, + "loss": 0.2739, + "step": 1243 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.32670439779758453, + "epoch": 3.2736842105263158, + "grad_norm": 0.010165052488446236, + "learning_rate": 1e-06, + "loss": 0.1227, + "step": 1244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.119140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12475.0, + "completions/mean_length": 2460.5546875, + "completions/mean_terminated_length": 577.3392333984375, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.29750363528728485, + "epoch": 3.276315789473684, + "frac_reward_zero_std": 0.28125, + "grad_norm": 246.82859802246094, + "learning_rate": 1e-06, + "loss": 0.2205, + "num_tokens": 403684715.0, + "reward": 0.7260443568229675, + "reward_std": 0.22644254565238953, + "rewards/progression_diversity/mean": -0.03717077150940895, + "rewards/progression_diversity/std": 0.10484474152326584, + "rewards/symbolic_reward_accuracy/mean": 0.798828125, + "rewards/symbolic_reward_accuracy/std": 0.4012683033943176, + "rewards/symbolic_reward_partial_score/mean": 0.8575845956802368, + "rewards/symbolic_reward_partial_score/std": 0.3189500868320465, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1037794351577759, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 296.0, + "sampling/sampling_logp_difference/mean": 2.2273709774017334, + "step": 1245 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.31314218044281006, + "epoch": 3.2789473684210524, + "grad_norm": 4.193942070007324, + "learning_rate": 1e-06, + "loss": 0.1196, + "step": 1246 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.309044748544693, + "epoch": 3.281578947368421, + "grad_norm": 0.0659249871969223, + "learning_rate": 1e-06, + "loss": 0.1756, + "step": 1247 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.285678505897522, + "epoch": 3.2842105263157895, + "grad_norm": 0.006575907580554485, + "learning_rate": 1e-06, + "loss": 0.2906, + "step": 1248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2405.0, + "completions/mean_length": 2114.935546875, + "completions/mean_terminated_length": 536.3622436523438, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.30202820897102356, + "epoch": 3.286842105263158, + "frac_reward_zero_std": 0.25, + "grad_norm": 161.16702270507812, + "learning_rate": 1e-06, + "loss": 0.2409, + "num_tokens": 405162026.0, + "reward": 0.7425325512886047, + "reward_std": 0.2102564126253128, + "rewards/progression_diversity/mean": -0.028976155444979668, + "rewards/progression_diversity/std": 0.09275542944669724, + "rewards/symbolic_reward_accuracy/mean": 0.814453125, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.87451171875, + "rewards/symbolic_reward_partial_score/std": 0.3029172122478485, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1059821844100952, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 296.0, + "sampling/sampling_logp_difference/mean": 2.2559051513671875, + "step": 1249 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.32025621831417084, + "epoch": 3.2894736842105265, + "grad_norm": 0.03536779060959816, + "learning_rate": 1e-06, + "loss": 0.1425, + "step": 1250 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.29748618602752686, + "epoch": 3.292105263157895, + "grad_norm": 1.4454317092895508, + "learning_rate": 1e-06, + "loss": 0.2194, + "step": 1251 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3038511127233505, + "epoch": 3.294736842105263, + "grad_norm": 0.013857961632311344, + "learning_rate": 1e-06, + "loss": 0.1161, + "step": 1252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10572.0, + "completions/mean_length": 2446.869140625, + "completions/mean_terminated_length": 596.8074951171875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.3028118759393692, + "epoch": 3.2973684210526315, + "frac_reward_zero_std": 0.125, + "grad_norm": 439.4346008300781, + "learning_rate": 1e-06, + "loss": 0.2289, + "num_tokens": 406818631.0, + "reward": 0.7258546948432922, + "reward_std": 0.24171996116638184, + "rewards/progression_diversity/mean": -0.04148336872458458, + "rewards/progression_diversity/std": 0.1141848936676979, + "rewards/symbolic_reward_accuracy/mean": 0.80078125, + "rewards/symbolic_reward_accuracy/std": 0.39980348944664, + "rewards/symbolic_reward_partial_score/mean": 0.8564453125, + "rewards/symbolic_reward_partial_score/std": 0.33105501532554626, + "rewards/tag_count_reward/mean": -0.111328125, + "rewards/tag_count_reward/std": 0.31484565138816833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1173810958862305, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 296.0, + "sampling/sampling_logp_difference/mean": 3.1980838775634766, + "step": 1253 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3106544762849808, + "epoch": 3.3, + "grad_norm": 0.009494182653725147, + "learning_rate": 1e-06, + "loss": 0.1479, + "step": 1254 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.2966717481613159, + "epoch": 3.3026315789473686, + "grad_norm": 3.8081188201904297, + "learning_rate": 1e-06, + "loss": 0.2924, + "step": 1255 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.29619279503822327, + "epoch": 3.305263157894737, + "grad_norm": 0.04518857225775719, + "learning_rate": 1e-06, + "loss": 0.31, + "step": 1256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 13429.0, + "completions/mean_length": 2205.7890625, + "completions/mean_terminated_length": 534.1222534179688, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.3126777410507202, + "epoch": 3.307894736842105, + "frac_reward_zero_std": 0.15625, + "grad_norm": 185.87486267089844, + "learning_rate": 1e-06, + "loss": 0.1568, + "num_tokens": 408320763.0, + "reward": 0.7695430517196655, + "reward_std": 0.2663387060165405, + "rewards/progression_diversity/mean": -0.03788202255964279, + "rewards/progression_diversity/std": 0.1108899936079979, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.8873697519302368, + "rewards/symbolic_reward_partial_score/std": 0.2990962564945221, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.1089106798171997, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 296.0, + "sampling/sampling_logp_difference/mean": 2.924543857574463, + "step": 1257 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2874736040830612, + "epoch": 3.3105263157894735, + "grad_norm": 0.3372192084789276, + "learning_rate": 1e-06, + "loss": 0.3459, + "step": 1258 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3138102889060974, + "epoch": 3.3131578947368423, + "grad_norm": 0.008773678913712502, + "learning_rate": 1e-06, + "loss": 0.116, + "step": 1259 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.28731557726860046, + "epoch": 3.3157894736842106, + "grad_norm": 0.016135241836309433, + "learning_rate": 1e-06, + "loss": 0.2797, + "step": 1260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8639.0, + "completions/mean_length": 2694.34375, + "completions/mean_terminated_length": 597.729736328125, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.2984755337238312, + "epoch": 3.318421052631579, + "frac_reward_zero_std": 0.34375, + "grad_norm": 48.49712371826172, + "learning_rate": 1e-06, + "loss": 0.1345, + "num_tokens": 410128779.0, + "reward": 0.7372407913208008, + "reward_std": 0.21289455890655518, + "rewards/progression_diversity/mean": -0.04057057946920395, + "rewards/progression_diversity/std": 0.10670918226242065, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.85986328125, + "rewards/symbolic_reward_partial_score/std": 0.32551804184913635, + "rewards/tag_count_reward/mean": -0.125, + "rewards/tag_count_reward/std": 0.3310423493385315, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.096588373184204, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 296.0, + "sampling/sampling_logp_difference/mean": 2.3006858825683594, + "step": 1261 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.203125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2700554430484772, + "epoch": 3.3210526315789473, + "grad_norm": 0.009636012837290764, + "learning_rate": 1e-06, + "loss": 0.3738, + "step": 1262 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.30288518965244293, + "epoch": 3.3236842105263156, + "grad_norm": 0.275381863117218, + "learning_rate": 1e-06, + "loss": 0.1203, + "step": 1263 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3005267381668091, + "epoch": 3.3263157894736843, + "grad_norm": 0.15126138925552368, + "learning_rate": 1e-06, + "loss": 0.1354, + "step": 1264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15602.0, + "completions/mean_length": 2717.74609375, + "completions/mean_terminated_length": 624.7162475585938, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.29306742548942566, + "epoch": 3.3289473684210527, + "frac_reward_zero_std": 0.125, + "grad_norm": 84.46516418457031, + "learning_rate": 1e-06, + "loss": 0.2436, + "num_tokens": 411934761.0, + "reward": 0.6926791071891785, + "reward_std": 0.26792874932289124, + "rewards/progression_diversity/mean": -0.03872973471879959, + "rewards/progression_diversity/std": 0.10431662201881409, + "rewards/symbolic_reward_accuracy/mean": 0.755859375, + "rewards/symbolic_reward_accuracy/std": 0.42999663949012756, + "rewards/symbolic_reward_partial_score/mean": 0.8388671875, + "rewards/symbolic_reward_partial_score/std": 0.327902227640152, + "rewards/tag_count_reward/mean": -0.12109375, + "rewards/tag_count_reward/std": 0.3265552520751953, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.09934663772583, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 298.0, + "sampling/sampling_logp_difference/mean": 3.240568161010742, + "step": 1265 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.28837116062641144, + "epoch": 3.331578947368421, + "grad_norm": 42.86471939086914, + "learning_rate": 1e-06, + "loss": 0.2322, + "step": 1266 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.3078329712152481, + "epoch": 3.3342105263157893, + "grad_norm": 0.009662003256380558, + "learning_rate": 1e-06, + "loss": 0.1655, + "step": 1267 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5703125, + "entropy": 0.2638581395149231, + "epoch": 3.336842105263158, + "grad_norm": 0.007266658823937178, + "learning_rate": 1e-06, + "loss": 0.2735, + "step": 1268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2300.0, + "completions/mean_length": 2193.158203125, + "completions/mean_terminated_length": 554.5642700195312, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.3069176971912384, + "epoch": 3.3394736842105264, + "frac_reward_zero_std": 0.28125, + "grad_norm": 140.47747802734375, + "learning_rate": 1e-06, + "loss": 0.1502, + "num_tokens": 413436602.0, + "reward": 0.7662738561630249, + "reward_std": 0.20352765917778015, + "rewards/progression_diversity/mean": -0.032773204147815704, + "rewards/progression_diversity/std": 0.09917779266834259, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.9016926884651184, + "rewards/symbolic_reward_partial_score/std": 0.27043241262435913, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.093727946281433, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 298.0, + "sampling/sampling_logp_difference/mean": 3.0766854286193848, + "step": 1269 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.32548436522483826, + "epoch": 3.3421052631578947, + "grad_norm": 0.011535858735442162, + "learning_rate": 1e-06, + "loss": 0.1011, + "step": 1270 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2972973585128784, + "epoch": 3.344736842105263, + "grad_norm": 0.00575890950858593, + "learning_rate": 1e-06, + "loss": 0.2421, + "step": 1271 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.2989150583744049, + "epoch": 3.3473684210526318, + "grad_norm": 0.008553121238946915, + "learning_rate": 1e-06, + "loss": 0.2467, + "step": 1272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.087890625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1972.0, + "completions/mean_length": 1971.931640625, + "completions/mean_terminated_length": 583.1884155273438, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "entropy": 0.31215590238571167, + "epoch": 3.35, + "frac_reward_zero_std": 0.21875, + "grad_norm": 132.39788818359375, + "learning_rate": 1e-06, + "loss": 0.1477, + "num_tokens": 414865399.0, + "reward": 0.7922816276550293, + "reward_std": 0.2368476241827011, + "rewards/progression_diversity/mean": -0.029654610902071, + "rewards/progression_diversity/std": 0.09719457477331161, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.904296875, + "rewards/symbolic_reward_partial_score/std": 0.28047847747802734, + "rewards/tag_count_reward/mean": -0.072265625, + "rewards/tag_count_reward/std": 0.2591804563999176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0867669582366943, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 300.0, + "sampling/sampling_logp_difference/mean": 3.5754165649414062, + "step": 1273 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3091353476047516, + "epoch": 3.3526315789473684, + "grad_norm": 0.010651475749909878, + "learning_rate": 1e-06, + "loss": 0.1685, + "step": 1274 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.30523787438869476, + "epoch": 3.3552631578947367, + "grad_norm": 0.008251101709902287, + "learning_rate": 1e-06, + "loss": 0.2036, + "step": 1275 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.29036298394203186, + "epoch": 3.3578947368421055, + "grad_norm": 0.012122713960707188, + "learning_rate": 1e-06, + "loss": 0.2905, + "step": 1276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1526.0, + "completions/mean_length": 1901.666015625, + "completions/mean_terminated_length": 573.8614501953125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.30242083966732025, + "epoch": 3.360526315789474, + "frac_reward_zero_std": 0.25, + "grad_norm": 185.1011199951172, + "learning_rate": 1e-06, + "loss": 0.2087, + "num_tokens": 416246188.0, + "reward": 0.7942166328430176, + "reward_std": 0.22256335616111755, + "rewards/progression_diversity/mean": -0.026581626385450363, + "rewards/progression_diversity/std": 0.0894814282655716, + "rewards/symbolic_reward_accuracy/mean": 0.87890625, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.9178059697151184, + "rewards/symbolic_reward_partial_score/std": 0.25296953320503235, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0797239542007446, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 300.0, + "sampling/sampling_logp_difference/mean": 3.349595069885254, + "step": 1277 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.30199746787548065, + "epoch": 3.363157894736842, + "grad_norm": 0.02538270130753517, + "learning_rate": 1e-06, + "loss": 0.2414, + "step": 1278 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3231187015771866, + "epoch": 3.3657894736842104, + "grad_norm": 0.012012549676001072, + "learning_rate": 1e-06, + "loss": 0.1036, + "step": 1279 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3105398118495941, + "epoch": 3.3684210526315788, + "grad_norm": 0.013158765621483326, + "learning_rate": 1e-06, + "loss": 0.1455, + "step": 1280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2128.0, + "completions/mean_length": 2354.533203125, + "completions/mean_terminated_length": 596.99560546875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.30621521174907684, + "epoch": 3.3710526315789475, + "frac_reward_zero_std": 0.25, + "grad_norm": 140.82589721679688, + "learning_rate": 1e-06, + "loss": 0.1302, + "num_tokens": 417859869.0, + "reward": 0.7398780584335327, + "reward_std": 0.2521520256996155, + "rewards/progression_diversity/mean": -0.0356326699256897, + "rewards/progression_diversity/std": 0.10271129757165909, + "rewards/symbolic_reward_accuracy/mean": 0.81640625, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.8697916269302368, + "rewards/symbolic_reward_partial_score/std": 0.3091786503791809, + "rewards/tag_count_reward/mean": -0.10546875, + "rewards/tag_count_reward/std": 0.3074568510055542, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0742948055267334, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 302.0, + "sampling/sampling_logp_difference/mean": 3.784925699234009, + "step": 1281 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.29408055543899536, + "epoch": 3.373684210526316, + "grad_norm": 0.010642572306096554, + "learning_rate": 1e-06, + "loss": 0.1868, + "step": 1282 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.29353274405002594, + "epoch": 3.376315789473684, + "grad_norm": 0.01788030005991459, + "learning_rate": 1e-06, + "loss": 0.1809, + "step": 1283 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.2797112911939621, + "epoch": 3.3789473684210525, + "grad_norm": 0.006915436126291752, + "learning_rate": 1e-06, + "loss": 0.2527, + "step": 1284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16124.0, + "completions/mean_length": 1972.298828125, + "completions/mean_terminated_length": 617.3526000976562, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.33247339725494385, + "epoch": 3.3815789473684212, + "frac_reward_zero_std": 0.25, + "grad_norm": 348.1959228515625, + "learning_rate": 1e-06, + "loss": 0.0597, + "num_tokens": 419281174.0, + "reward": 0.793836772441864, + "reward_std": 0.20707328617572784, + "rewards/progression_diversity/mean": -0.030389215797185898, + "rewards/progression_diversity/std": 0.09979861974716187, + "rewards/symbolic_reward_accuracy/mean": 0.87890625, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.9153646230697632, + "rewards/symbolic_reward_partial_score/std": 0.25428155064582825, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.062885046005249, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 304.0, + "sampling/sampling_logp_difference/mean": 3.6513113975524902, + "step": 1285 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3000902533531189, + "epoch": 3.3842105263157896, + "grad_norm": 0.006365468725562096, + "learning_rate": 1e-06, + "loss": 0.1583, + "step": 1286 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.2907492071390152, + "epoch": 3.386842105263158, + "grad_norm": 0.013992332853376865, + "learning_rate": 1e-06, + "loss": 0.2211, + "step": 1287 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2959248870611191, + "epoch": 3.389473684210526, + "grad_norm": 0.006391443312168121, + "learning_rate": 1e-06, + "loss": 0.2083, + "step": 1288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9612.0, + "completions/mean_length": 2112.052734375, + "completions/mean_terminated_length": 635.6444091796875, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.3044814467430115, + "epoch": 3.3921052631578945, + "frac_reward_zero_std": 0.15625, + "grad_norm": 314.1883239746094, + "learning_rate": 1e-06, + "loss": 0.1179, + "num_tokens": 420783377.0, + "reward": 0.77393639087677, + "reward_std": 0.25700780749320984, + "rewards/progression_diversity/mean": -0.03312736004590988, + "rewards/progression_diversity/std": 0.10419656336307526, + "rewards/symbolic_reward_accuracy/mean": 0.857421875, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.8946939706802368, + "rewards/symbolic_reward_partial_score/std": 0.2826669216156006, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0506939888000488, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 304.0, + "sampling/sampling_logp_difference/mean": 5.086845397949219, + "step": 1289 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.27778656780719757, + "epoch": 3.3947368421052633, + "grad_norm": 0.013014139607548714, + "learning_rate": 1e-06, + "loss": 0.2738, + "step": 1290 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.2996399998664856, + "epoch": 3.3973684210526316, + "grad_norm": 0.06032712757587433, + "learning_rate": 1e-06, + "loss": 0.2089, + "step": 1291 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.30588841438293457, + "epoch": 3.4, + "grad_norm": 0.018283555284142494, + "learning_rate": 1e-06, + "loss": 0.1612, + "step": 1292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10146.0, + "completions/mean_length": 2171.923828125, + "completions/mean_terminated_length": 565.34130859375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.3020727336406708, + "epoch": 3.4026315789473682, + "frac_reward_zero_std": 0.25, + "grad_norm": 125.83609008789062, + "learning_rate": 1e-06, + "loss": 0.0761, + "num_tokens": 422257322.0, + "reward": 0.7886345386505127, + "reward_std": 0.19221815466880798, + "rewards/progression_diversity/mean": -0.03791151940822601, + "rewards/progression_diversity/std": 0.11339190602302551, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9158528447151184, + "rewards/symbolic_reward_partial_score/std": 0.25923240184783936, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0494663715362549, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 304.0, + "sampling/sampling_logp_difference/mean": 4.9170942306518555, + "step": 1293 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.29434891045093536, + "epoch": 3.405263157894737, + "grad_norm": 0.01162765920162201, + "learning_rate": 1e-06, + "loss": 0.1834, + "step": 1294 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2950708270072937, + "epoch": 3.4078947368421053, + "grad_norm": 0.009900042787194252, + "learning_rate": 1e-06, + "loss": 0.2169, + "step": 1295 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.30467459559440613, + "epoch": 3.4105263157894736, + "grad_norm": 0.009985024109482765, + "learning_rate": 1e-06, + "loss": 0.1849, + "step": 1296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15928.0, + "completions/mean_length": 3336.40625, + "completions/mean_terminated_length": 628.4151000976562, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.2799158841371536, + "epoch": 3.413157894736842, + "frac_reward_zero_std": 0.125, + "grad_norm": 283.4808654785156, + "learning_rate": 1e-06, + "loss": 0.166, + "num_tokens": 424360346.0, + "reward": 0.7545630931854248, + "reward_std": 0.22717466950416565, + "rewards/progression_diversity/mean": -0.06615223735570908, + "rewards/progression_diversity/std": 0.14478321373462677, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.87744140625, + "rewards/symbolic_reward_partial_score/std": 0.3093447685241699, + "rewards/tag_count_reward/mean": -0.119140625, + "rewards/tag_count_reward/std": 0.32427072525024414, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0230858325958252, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 304.0, + "sampling/sampling_logp_difference/mean": 7.494105815887451, + "step": 1297 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.23919285833835602, + "epoch": 3.4157894736842107, + "grad_norm": 1.23762845993042, + "learning_rate": 1e-06, + "loss": 0.371, + "step": 1298 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.25987882912158966, + "epoch": 3.418421052631579, + "grad_norm": 0.5271911025047302, + "learning_rate": 1e-06, + "loss": 0.2374, + "step": 1299 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.26133809983730316, + "epoch": 3.4210526315789473, + "grad_norm": 0.11925064027309418, + "learning_rate": 1e-06, + "loss": 0.188, + "step": 1300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10431.0, + "completions/mean_length": 3059.662109375, + "completions/mean_terminated_length": 592.192138671875, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.2862105369567871, + "epoch": 3.4236842105263157, + "frac_reward_zero_std": 0.09375, + "grad_norm": 341.6363830566406, + "learning_rate": 1e-06, + "loss": 0.195, + "num_tokens": 426325869.0, + "reward": 0.7616207599639893, + "reward_std": 0.24166487157344818, + "rewards/progression_diversity/mean": -0.05862627178430557, + "rewards/progression_diversity/std": 0.13662515580654144, + "rewards/symbolic_reward_accuracy/mean": 0.845703125, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.8831380009651184, + "rewards/symbolic_reward_partial_score/std": 0.301278293132782, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0274146795272827, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 304.0, + "sampling/sampling_logp_difference/mean": 7.100645542144775, + "step": 1301 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.27012303471565247, + "epoch": 3.4263157894736844, + "grad_norm": 3.0215957164764404, + "learning_rate": 1e-06, + "loss": 0.2341, + "step": 1302 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.26916660368442535, + "epoch": 3.4289473684210527, + "grad_norm": 37.49262619018555, + "learning_rate": 1e-06, + "loss": 0.1515, + "step": 1303 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.27728109061717987, + "epoch": 3.431578947368421, + "grad_norm": 6.821070194244385, + "learning_rate": 1e-06, + "loss": 0.2638, + "step": 1304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15417.0, + "completions/mean_length": 3132.240234375, + "completions/mean_terminated_length": 678.2106323242188, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.2735847979784012, + "epoch": 3.4342105263157894, + "frac_reward_zero_std": 0.125, + "grad_norm": 166.9344940185547, + "learning_rate": 1e-06, + "loss": 0.2523, + "num_tokens": 428342568.0, + "reward": 0.7301239967346191, + "reward_std": 0.27412816882133484, + "rewards/progression_diversity/mean": -0.05889313295483589, + "rewards/progression_diversity/std": 0.136581152677536, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.8640950918197632, + "rewards/symbolic_reward_partial_score/std": 0.31252816319465637, + "rewards/tag_count_reward/mean": -0.125, + "rewards/tag_count_reward/std": 0.3310423493385315, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0343992710113525, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 306.0, + "sampling/sampling_logp_difference/mean": 6.35936164855957, + "step": 1305 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.264299601316452, + "epoch": 3.4368421052631577, + "grad_norm": 14.73751449584961, + "learning_rate": 1e-06, + "loss": 0.1532, + "step": 1306 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.27560053765773773, + "epoch": 3.4394736842105265, + "grad_norm": 2.936858892440796, + "learning_rate": 1e-06, + "loss": 0.2459, + "step": 1307 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2791202515363693, + "epoch": 3.442105263157895, + "grad_norm": 0.5196275115013123, + "learning_rate": 1e-06, + "loss": 0.2228, + "step": 1308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.130859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2387.0, + "completions/mean_length": 2650.87890625, + "completions/mean_terminated_length": 583.1954956054688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.30482953786849976, + "epoch": 3.444736842105263, + "frac_reward_zero_std": 0.03125, + "grad_norm": 386.9339294433594, + "learning_rate": 1e-06, + "loss": 0.1641, + "num_tokens": 430092778.0, + "reward": 0.7358328104019165, + "reward_std": 0.2448652684688568, + "rewards/progression_diversity/mean": -0.04465062916278839, + "rewards/progression_diversity/std": 0.11796204745769501, + "rewards/symbolic_reward_accuracy/mean": 0.806640625, + "rewards/symbolic_reward_accuracy/std": 0.39531853795051575, + "rewards/symbolic_reward_partial_score/mean": 0.87548828125, + "rewards/symbolic_reward_partial_score/std": 0.299261212348938, + "rewards/tag_count_reward/mean": -0.103515625, + "rewards/tag_count_reward/std": 0.30492907762527466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0414533615112305, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 304.0, + "sampling/sampling_logp_difference/mean": 5.4577789306640625, + "step": 1309 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.28392690420150757, + "epoch": 3.4473684210526314, + "grad_norm": 160.2227020263672, + "learning_rate": 1e-06, + "loss": 0.2389, + "step": 1310 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3085232526063919, + "epoch": 3.45, + "grad_norm": 0.17238354682922363, + "learning_rate": 1e-06, + "loss": 0.1332, + "step": 1311 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3984375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.53125, + "entropy": 0.2967042475938797, + "epoch": 3.4526315789473685, + "grad_norm": 7.176582336425781, + "learning_rate": 1e-06, + "loss": 0.1358, + "step": 1312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9963.0, + "completions/mean_length": 2258.275390625, + "completions/mean_terminated_length": 627.200439453125, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.31084753572940826, + "epoch": 3.455263157894737, + "frac_reward_zero_std": 0.25, + "grad_norm": 914.4140625, + "learning_rate": 1e-06, + "loss": 0.1569, + "num_tokens": 431662167.0, + "reward": 0.7634491920471191, + "reward_std": 0.23263157904148102, + "rewards/progression_diversity/mean": -0.03691425174474716, + "rewards/progression_diversity/std": 0.10997199267148972, + "rewards/symbolic_reward_accuracy/mean": 0.841796875, + "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, + "rewards/symbolic_reward_partial_score/mean": 0.8917642831802368, + "rewards/symbolic_reward_partial_score/std": 0.28572434186935425, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0521239042282104, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 4.490725994110107, + "step": 1313 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3135823905467987, + "epoch": 3.457894736842105, + "grad_norm": 48.58332824707031, + "learning_rate": 1e-06, + "loss": 0.1558, + "step": 1314 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.32143357396125793, + "epoch": 3.4605263157894735, + "grad_norm": 0.030546952039003372, + "learning_rate": 1e-06, + "loss": 0.1197, + "step": 1315 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.29036542773246765, + "epoch": 3.463157894736842, + "grad_norm": 0.008445881307125092, + "learning_rate": 1e-06, + "loss": 0.2007, + "step": 1316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2140.0, + "completions/mean_length": 2460.689453125, + "completions/mean_terminated_length": 612.46240234375, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.29718485474586487, + "epoch": 3.4657894736842105, + "frac_reward_zero_std": 0.15625, + "grad_norm": 533.7861938476562, + "learning_rate": 1e-06, + "loss": 0.1662, + "num_tokens": 433313176.0, + "reward": 0.7628533840179443, + "reward_std": 0.2578679919242859, + "rewards/progression_diversity/mean": -0.04279157519340515, + "rewards/progression_diversity/std": 0.11989957839250565, + "rewards/symbolic_reward_accuracy/mean": 0.84375, + "rewards/symbolic_reward_accuracy/std": 0.36344730854034424, + "rewards/symbolic_reward_partial_score/mean": 0.8880208730697632, + "rewards/symbolic_reward_partial_score/std": 0.28819218277931213, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0420756340026855, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 5.4957427978515625, + "step": 1317 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3101320266723633, + "epoch": 3.468421052631579, + "grad_norm": 0.01260958332568407, + "learning_rate": 1e-06, + "loss": 0.1788, + "step": 1318 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.27350659668445587, + "epoch": 3.4710526315789476, + "grad_norm": 3.119270086288452, + "learning_rate": 1e-06, + "loss": 0.3215, + "step": 1319 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.40625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.3225550502538681, + "epoch": 3.473684210526316, + "grad_norm": 2.4085659980773926, + "learning_rate": 1e-06, + "loss": 0.1297, + "step": 1320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16143.0, + "completions/mean_length": 2217.62109375, + "completions/mean_terminated_length": 650.4078369140625, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.3091781437397003, + "epoch": 3.4763157894736842, + "frac_reward_zero_std": 0.25, + "grad_norm": 291.2024230957031, + "learning_rate": 1e-06, + "loss": 0.1649, + "num_tokens": 434844726.0, + "reward": 0.7853677272796631, + "reward_std": 0.206242173910141, + "rewards/progression_diversity/mean": -0.03744862973690033, + "rewards/progression_diversity/std": 0.11320951581001282, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9075521230697632, + "rewards/symbolic_reward_partial_score/std": 0.2654723823070526, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.050972580909729, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 4.5054779052734375, + "step": 1321 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.30050645768642426, + "epoch": 3.4789473684210526, + "grad_norm": 0.1077585518360138, + "learning_rate": 1e-06, + "loss": 0.2148, + "step": 1322 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.3242476284503937, + "epoch": 3.481578947368421, + "grad_norm": 0.015654178336262703, + "learning_rate": 1e-06, + "loss": 0.1183, + "step": 1323 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3109499663114548, + "epoch": 3.4842105263157896, + "grad_norm": 0.02455841936171055, + "learning_rate": 1e-06, + "loss": 0.1862, + "step": 1324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.158203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8314.0, + "completions/mean_length": 3150.29296875, + "completions/mean_terminated_length": 663.2157592773438, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.27029645442962646, + "epoch": 3.486842105263158, + "frac_reward_zero_std": 0.125, + "grad_norm": 303.6960144042969, + "learning_rate": 1e-06, + "loss": 0.1961, + "num_tokens": 436880172.0, + "reward": 0.7058703899383545, + "reward_std": 0.2576483488082886, + "rewards/progression_diversity/mean": -0.057497672736644745, + "rewards/progression_diversity/std": 0.13345807790756226, + "rewards/symbolic_reward_accuracy/mean": 0.76953125, + "rewards/symbolic_reward_accuracy/std": 0.42154473066329956, + "rewards/symbolic_reward_partial_score/mean": 0.8619791269302368, + "rewards/symbolic_reward_partial_score/std": 0.3079785406589508, + "rewards/tag_count_reward/mean": -0.138671875, + "rewards/tag_count_reward/std": 0.34594178199768066, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0368916988372803, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 5.3984527587890625, + "step": 1325 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.2876896560192108, + "epoch": 3.4894736842105263, + "grad_norm": 0.016265718266367912, + "learning_rate": 1e-06, + "loss": 0.1761, + "step": 1326 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.27050982415676117, + "epoch": 3.4921052631578946, + "grad_norm": 1.3987834453582764, + "learning_rate": 1e-06, + "loss": 0.269, + "step": 1327 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.28464990854263306, + "epoch": 3.4947368421052634, + "grad_norm": 32.02823257446289, + "learning_rate": 1e-06, + "loss": 0.1709, + "step": 1328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10308.0, + "completions/mean_length": 1944.478515625, + "completions/mean_terminated_length": 654.1383056640625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.2990902215242386, + "epoch": 3.4973684210526317, + "frac_reward_zero_std": 0.1875, + "grad_norm": 411.1249694824219, + "learning_rate": 1e-06, + "loss": 0.2033, + "num_tokens": 438276225.0, + "reward": 0.7881462574005127, + "reward_std": 0.22861242294311523, + "rewards/progression_diversity/mean": -0.03303021937608719, + "rewards/progression_diversity/std": 0.1093946024775505, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9140625, + "rewards/symbolic_reward_partial_score/std": 0.2569299340248108, + "rewards/tag_count_reward/mean": -0.083984375, + "rewards/tag_count_reward/std": 0.2776356339454651, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.038718819618225, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 5.05161190032959, + "step": 1329 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3089918792247772, + "epoch": 3.5, + "grad_norm": 0.021115725859999657, + "learning_rate": 1e-06, + "loss": 0.1788, + "step": 1330 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3167189657688141, + "epoch": 3.5026315789473683, + "grad_norm": 0.010390168987214565, + "learning_rate": 1e-06, + "loss": 0.1894, + "step": 1331 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.33414115011692047, + "epoch": 3.5052631578947366, + "grad_norm": 0.01447315514087677, + "learning_rate": 1e-06, + "loss": 0.0922, + "step": 1332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.12109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3667.0, + "completions/mean_length": 2530.16015625, + "completions/mean_terminated_length": 621.4088745117188, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.2866598814725876, + "epoch": 3.5078947368421054, + "frac_reward_zero_std": 0.15625, + "grad_norm": 1008.520751953125, + "learning_rate": 1e-06, + "loss": 0.2519, + "num_tokens": 439965971.0, + "reward": 0.7736403942108154, + "reward_std": 0.23456132411956787, + "rewards/progression_diversity/mean": -0.0480697825551033, + "rewards/progression_diversity/std": 0.13115154206752777, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.8948568105697632, + "rewards/symbolic_reward_partial_score/std": 0.2902930974960327, + "rewards/tag_count_reward/mean": -0.111328125, + "rewards/tag_count_reward/std": 0.31484565138816833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0280325412750244, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 5.589191436767578, + "step": 1333 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.30402611196041107, + "epoch": 3.5105263157894737, + "grad_norm": 1195.4456787109375, + "learning_rate": 1e-06, + "loss": 0.2695, + "step": 1334 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.29913991689682007, + "epoch": 3.513157894736842, + "grad_norm": 0.3809571862220764, + "learning_rate": 1e-06, + "loss": 0.1482, + "step": 1335 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.31326740980148315, + "epoch": 3.515789473684211, + "grad_norm": 1.5359933376312256, + "learning_rate": 1e-06, + "loss": 0.1986, + "step": 1336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10774.0, + "completions/mean_length": 2040.724609375, + "completions/mean_terminated_length": 624.8648071289062, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.3067469596862793, + "epoch": 3.518421052631579, + "frac_reward_zero_std": 0.34375, + "grad_norm": 130.65467834472656, + "learning_rate": 1e-06, + "loss": 0.1706, + "num_tokens": 441414950.0, + "reward": 0.7661923170089722, + "reward_std": 0.18958313763141632, + "rewards/progression_diversity/mean": -0.03604454547166824, + "rewards/progression_diversity/std": 0.11484090983867645, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.9021810293197632, + "rewards/symbolic_reward_partial_score/std": 0.2684669494628906, + "rewards/tag_count_reward/mean": -0.080078125, + "rewards/tag_count_reward/std": 0.271679550409317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0413990020751953, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 4.274114608764648, + "step": 1337 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3064485937356949, + "epoch": 3.5210526315789474, + "grad_norm": 4.137460708618164, + "learning_rate": 1e-06, + "loss": 0.1474, + "step": 1338 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.30475564301013947, + "epoch": 3.5236842105263158, + "grad_norm": 0.25504910945892334, + "learning_rate": 1e-06, + "loss": 0.1556, + "step": 1339 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3473477065563202, + "epoch": 3.526315789473684, + "grad_norm": 0.14421966671943665, + "learning_rate": 1e-06, + "loss": 0.076, + "step": 1340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.119140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15636.0, + "completions/mean_length": 2503.556640625, + "completions/mean_terminated_length": 626.1574096679688, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.3233236074447632, + "epoch": 3.5289473684210524, + "frac_reward_zero_std": 0.21875, + "grad_norm": 764.384765625, + "learning_rate": 1e-06, + "loss": 0.1338, + "num_tokens": 443064707.0, + "reward": 0.7563366889953613, + "reward_std": 0.2102624773979187, + "rewards/progression_diversity/mean": -0.04993094503879547, + "rewards/progression_diversity/std": 0.1373254358768463, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.8860677480697632, + "rewards/symbolic_reward_partial_score/std": 0.28898024559020996, + "rewards/tag_count_reward/mean": -0.10546875, + "rewards/tag_count_reward/std": 0.3074568510055542, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0196648836135864, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 6.12931489944458, + "step": 1341 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2928446978330612, + "epoch": 3.531578947368421, + "grad_norm": 0.5019714832305908, + "learning_rate": 1e-06, + "loss": 0.224, + "step": 1342 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3110320568084717, + "epoch": 3.5342105263157895, + "grad_norm": 0.13324137032032013, + "learning_rate": 1e-06, + "loss": 0.1591, + "step": 1343 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.31039758026599884, + "epoch": 3.536842105263158, + "grad_norm": 0.014469148591160774, + "learning_rate": 1e-06, + "loss": 0.2039, + "step": 1344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2137.0, + "completions/mean_length": 2396.134765625, + "completions/mean_terminated_length": 643.808837890625, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.33016908168792725, + "epoch": 3.5394736842105265, + "frac_reward_zero_std": 0.25, + "grad_norm": 333.5965576171875, + "learning_rate": 1e-06, + "loss": 0.108, + "num_tokens": 444689480.0, + "reward": 0.7707592844963074, + "reward_std": 0.21156515181064606, + "rewards/progression_diversity/mean": -0.04321339726448059, + "rewards/progression_diversity/std": 0.1244661882519722, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.9091796875, + "rewards/symbolic_reward_partial_score/std": 0.25746801495552063, + "rewards/tag_count_reward/mean": -0.1015625, + "rewards/tag_count_reward/std": 0.30236753821372986, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.030545949935913, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 5.220053672790527, + "step": 1345 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3137245178222656, + "epoch": 3.542105263157895, + "grad_norm": 0.03173833340406418, + "learning_rate": 1e-06, + "loss": 0.1528, + "step": 1346 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3061438798904419, + "epoch": 3.544736842105263, + "grad_norm": 0.012165974825620651, + "learning_rate": 1e-06, + "loss": 0.1486, + "step": 1347 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.3039957582950592, + "epoch": 3.5473684210526315, + "grad_norm": 0.023023858666419983, + "learning_rate": 1e-06, + "loss": 0.2288, + "step": 1348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.115234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9747.0, + "completions/mean_length": 2442.896484375, + "completions/mean_terminated_length": 627.1677856445312, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.31957219541072845, + "epoch": 3.55, + "frac_reward_zero_std": 0.21875, + "grad_norm": 117.19609069824219, + "learning_rate": 1e-06, + "loss": 0.1076, + "num_tokens": 446334707.0, + "reward": 0.7665076851844788, + "reward_std": 0.2067509889602661, + "rewards/progression_diversity/mean": -0.04357065260410309, + "rewards/progression_diversity/std": 0.12135568261146545, + "rewards/symbolic_reward_accuracy/mean": 0.845703125, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.90087890625, + "rewards/symbolic_reward_partial_score/std": 0.2657473385334015, + "rewards/tag_count_reward/mean": -0.107421875, + "rewards/tag_count_reward/std": 0.30995169281959534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0276845693588257, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 5.852321624755859, + "step": 1349 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.30125513672828674, + "epoch": 3.5526315789473686, + "grad_norm": 0.008969382382929325, + "learning_rate": 1e-06, + "loss": 0.2238, + "step": 1350 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.300917848944664, + "epoch": 3.555263157894737, + "grad_norm": 0.008844579569995403, + "learning_rate": 1e-06, + "loss": 0.1436, + "step": 1351 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.28367167711257935, + "epoch": 3.557894736842105, + "grad_norm": 0.01489369384944439, + "learning_rate": 1e-06, + "loss": 0.2285, + "step": 1352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.138671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15503.0, + "completions/mean_length": 2874.439453125, + "completions/mean_terminated_length": 699.4308471679688, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.29677602648735046, + "epoch": 3.5605263157894735, + "frac_reward_zero_std": 0.0625, + "grad_norm": 187.5312957763672, + "learning_rate": 1e-06, + "loss": 0.1843, + "num_tokens": 448214740.0, + "reward": 0.7094203233718872, + "reward_std": 0.2542072832584381, + "rewards/progression_diversity/mean": -0.04918673634529114, + "rewards/progression_diversity/std": 0.1233910620212555, + "rewards/symbolic_reward_accuracy/mean": 0.783203125, + "rewards/symbolic_reward_accuracy/std": 0.4124660789966583, + "rewards/symbolic_reward_partial_score/mean": 0.8396809697151184, + "rewards/symbolic_reward_partial_score/std": 0.3393942713737488, + "rewards/tag_count_reward/mean": -0.119140625, + "rewards/tag_count_reward/std": 0.32427072525024414, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0222926139831543, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 308.0, + "sampling/sampling_logp_difference/mean": 6.464677810668945, + "step": 1353 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.30209778249263763, + "epoch": 3.5631578947368423, + "grad_norm": 0.01030214224010706, + "learning_rate": 1e-06, + "loss": 0.1664, + "step": 1354 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.2921713888645172, + "epoch": 3.5657894736842106, + "grad_norm": 0.3698064982891083, + "learning_rate": 1e-06, + "loss": 0.2283, + "step": 1355 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.453125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.6171875, + "entropy": 0.2947758883237839, + "epoch": 3.568421052631579, + "grad_norm": 0.0077121201902627945, + "learning_rate": 1e-06, + "loss": 0.2436, + "step": 1356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10491.0, + "completions/mean_length": 2377.16796875, + "completions/mean_terminated_length": 622.4659423828125, + "completions/min_length": 229.0, + "completions/min_terminated_length": 229.0, + "entropy": 0.30925987660884857, + "epoch": 3.5710526315789473, + "frac_reward_zero_std": 0.15625, + "grad_norm": 539.5180053710938, + "learning_rate": 1e-06, + "loss": 0.1286, + "num_tokens": 449834314.0, + "reward": 0.7453286647796631, + "reward_std": 0.21328219771385193, + "rewards/progression_diversity/mean": -0.03745080530643463, + "rewards/progression_diversity/std": 0.10799506306648254, + "rewards/symbolic_reward_accuracy/mean": 0.822265625, + "rewards/symbolic_reward_accuracy/std": 0.3826628625392914, + "rewards/symbolic_reward_partial_score/mean": 0.8756510019302368, + "rewards/symbolic_reward_partial_score/std": 0.29454636573791504, + "rewards/tag_count_reward/mean": -0.103515625, + "rewards/tag_count_reward/std": 0.30492907762527466, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.033726692199707, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 312.0, + "sampling/sampling_logp_difference/mean": 5.676729202270508, + "step": 1357 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.31313398480415344, + "epoch": 3.5736842105263156, + "grad_norm": 0.014206201769411564, + "learning_rate": 1e-06, + "loss": 0.1543, + "step": 1358 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3828125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2981453835964203, + "epoch": 3.5763157894736843, + "grad_norm": 0.24512311816215515, + "learning_rate": 1e-06, + "loss": 0.2296, + "step": 1359 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.3176209479570389, + "epoch": 3.5789473684210527, + "grad_norm": 0.02592810057103634, + "learning_rate": 1e-06, + "loss": 0.2541, + "step": 1360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14099.0, + "completions/mean_length": 2200.619140625, + "completions/mean_terminated_length": 631.5249633789062, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.320136159658432, + "epoch": 3.581578947368421, + "frac_reward_zero_std": 0.28125, + "grad_norm": 498.01348876953125, + "learning_rate": 1e-06, + "loss": 0.1276, + "num_tokens": 451356839.0, + "reward": 0.7508112192153931, + "reward_std": 0.2062763273715973, + "rewards/progression_diversity/mean": -0.031185226514935493, + "rewards/progression_diversity/std": 0.09589832276105881, + "rewards/symbolic_reward_accuracy/mean": 0.82421875, + "rewards/symbolic_reward_accuracy/std": 0.3810062110424042, + "rewards/symbolic_reward_partial_score/mean": 0.8839517831802368, + "rewards/symbolic_reward_partial_score/std": 0.2886301875114441, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0460567474365234, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 312.0, + "sampling/sampling_logp_difference/mean": 4.3595733642578125, + "step": 1361 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3112920671701431, + "epoch": 3.5842105263157897, + "grad_norm": 0.022203197702765465, + "learning_rate": 1e-06, + "loss": 0.218, + "step": 1362 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.31792403757572174, + "epoch": 3.586842105263158, + "grad_norm": 0.020310793071985245, + "learning_rate": 1e-06, + "loss": 0.1697, + "step": 1363 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.3065248429775238, + "epoch": 3.5894736842105264, + "grad_norm": 0.00812375359237194, + "learning_rate": 1e-06, + "loss": 0.1274, + "step": 1364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.138671875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11008.0, + "completions/mean_length": 2851.1796875, + "completions/mean_terminated_length": 672.4263305664062, + "completions/min_length": 232.0, + "completions/min_terminated_length": 232.0, + "entropy": 0.2910042405128479, + "epoch": 3.5921052631578947, + "frac_reward_zero_std": 0.15625, + "grad_norm": 208.19349670410156, + "learning_rate": 1e-06, + "loss": 0.2613, + "num_tokens": 453229795.0, + "reward": 0.699255645275116, + "reward_std": 0.22771009802818298, + "rewards/progression_diversity/mean": -0.04025688022375107, + "rewards/progression_diversity/std": 0.10407532006502151, + "rewards/symbolic_reward_accuracy/mean": 0.76171875, + "rewards/symbolic_reward_accuracy/std": 0.42644867300987244, + "rewards/symbolic_reward_partial_score/mean": 0.8439127802848816, + "rewards/symbolic_reward_partial_score/std": 0.31969720125198364, + "rewards/tag_count_reward/mean": -0.10546875, + "rewards/tag_count_reward/std": 0.3074568510055542, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0359622240066528, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 312.0, + "sampling/sampling_logp_difference/mean": 5.4447712898254395, + "step": 1365 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2833098769187927, + "epoch": 3.594736842105263, + "grad_norm": 1.1845999956130981, + "learning_rate": 1e-06, + "loss": 0.1663, + "step": 1366 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2765520215034485, + "epoch": 3.5973684210526313, + "grad_norm": 4.952581882476807, + "learning_rate": 1e-06, + "loss": 0.2748, + "step": 1367 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.2997213453054428, + "epoch": 3.6, + "grad_norm": 0.008390809409320354, + "learning_rate": 1e-06, + "loss": 0.1059, + "step": 1368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.150390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2273.0, + "completions/mean_length": 2958.927734375, + "completions/mean_terminated_length": 582.53564453125, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.2849253863096237, + "epoch": 3.6026315789473684, + "frac_reward_zero_std": 0.125, + "grad_norm": 119.27400207519531, + "learning_rate": 1e-06, + "loss": 0.1874, + "num_tokens": 455127902.0, + "reward": 0.7533248662948608, + "reward_std": 0.24902209639549255, + "rewards/progression_diversity/mean": -0.048372894525527954, + "rewards/progression_diversity/std": 0.11722088605165482, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.8759765625, + "rewards/symbolic_reward_partial_score/std": 0.3028711676597595, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.029304027557373, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 312.0, + "sampling/sampling_logp_difference/mean": 6.3514299392700195, + "step": 1369 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.29380667209625244, + "epoch": 3.6052631578947367, + "grad_norm": 4.995346546173096, + "learning_rate": 1e-06, + "loss": 0.209, + "step": 1370 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.2862599343061447, + "epoch": 3.6078947368421055, + "grad_norm": 794.2086181640625, + "learning_rate": 1e-06, + "loss": 0.2462, + "step": 1371 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.26471564173698425, + "epoch": 3.610526315789474, + "grad_norm": 0.02694852091372013, + "learning_rate": 1e-06, + "loss": 0.2141, + "step": 1372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.18359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2655.0, + "completions/mean_length": 3480.74609375, + "completions/mean_terminated_length": 579.057373046875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.29130323231220245, + "epoch": 3.613157894736842, + "frac_reward_zero_std": 0.0625, + "grad_norm": 67.4537353515625, + "learning_rate": 1e-06, + "loss": 0.1288, + "num_tokens": 457293500.0, + "reward": 0.7533182501792908, + "reward_std": 0.19987264275550842, + "rewards/progression_diversity/mean": -0.0636826902627945, + "rewards/progression_diversity/std": 0.13695798814296722, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.8790690302848816, + "rewards/symbolic_reward_partial_score/std": 0.2996877133846283, + "rewards/tag_count_reward/mean": -0.08984375, + "rewards/tag_count_reward/std": 0.2862374484539032, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0184675455093384, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 312.0, + "sampling/sampling_logp_difference/mean": 7.7314229011535645, + "step": 1373 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.24614471197128296, + "epoch": 3.6157894736842104, + "grad_norm": 132.7631378173828, + "learning_rate": 1e-06, + "loss": 0.2735, + "step": 1374 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1953125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.23733964562416077, + "epoch": 3.6184210526315788, + "grad_norm": 0.007347916718572378, + "learning_rate": 1e-06, + "loss": 0.2669, + "step": 1375 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4140625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.262113094329834, + "epoch": 3.6210526315789475, + "grad_norm": 0.00864808913320303, + "learning_rate": 1e-06, + "loss": 0.2765, + "step": 1376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2650.0, + "completions/mean_length": 2676.58984375, + "completions/mean_terminated_length": 577.2567749023438, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "entropy": 0.2994058430194855, + "epoch": 3.623684210526316, + "frac_reward_zero_std": 0.1875, + "grad_norm": 173.56097412109375, + "learning_rate": 1e-06, + "loss": 0.1616, + "num_tokens": 459061546.0, + "reward": 0.8030776977539062, + "reward_std": 0.17727278172969818, + "rewards/progression_diversity/mean": -0.04867924749851227, + "rewards/progression_diversity/std": 0.1260039210319519, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.9161783456802368, + "rewards/symbolic_reward_partial_score/std": 0.2622043490409851, + "rewards/tag_count_reward/mean": -0.068359375, + "rewards/tag_count_reward/std": 0.25260838866233826, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0174307823181152, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 312.0, + "sampling/sampling_logp_difference/mean": 7.301526069641113, + "step": 1377 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2790711969137192, + "epoch": 3.626315789473684, + "grad_norm": 6.741792678833008, + "learning_rate": 1e-06, + "loss": 0.3263, + "step": 1378 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.359375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.28607361018657684, + "epoch": 3.6289473684210525, + "grad_norm": 0.06569147855043411, + "learning_rate": 1e-06, + "loss": 0.1445, + "step": 1379 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2860715389251709, + "epoch": 3.6315789473684212, + "grad_norm": 0.07871459424495697, + "learning_rate": 1e-06, + "loss": 0.1596, + "step": 1380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9562.0, + "completions/mean_length": 2981.08984375, + "completions/mean_terminated_length": 644.802734375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.28075069189071655, + "epoch": 3.6342105263157896, + "frac_reward_zero_std": 0.1875, + "grad_norm": 278.2060546875, + "learning_rate": 1e-06, + "loss": 0.1905, + "num_tokens": 460993176.0, + "reward": 0.6964207887649536, + "reward_std": 0.240301251411438, + "rewards/progression_diversity/mean": -0.0503075085580349, + "rewards/progression_diversity/std": 0.12393683940172195, + "rewards/symbolic_reward_accuracy/mean": 0.767578125, + "rewards/symbolic_reward_accuracy/std": 0.42278963327407837, + "rewards/symbolic_reward_partial_score/mean": 0.82958984375, + "rewards/symbolic_reward_partial_score/std": 0.34416452050209045, + "rewards/tag_count_reward/mean": -0.125, + "rewards/tag_count_reward/std": 0.3310423493385315, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0247937440872192, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 314.0, + "sampling/sampling_logp_difference/mean": 6.386007308959961, + "step": 1381 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.28298284113407135, + "epoch": 3.636842105263158, + "grad_norm": 0.013535849750041962, + "learning_rate": 1e-06, + "loss": 0.2295, + "step": 1382 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5078125, + "entropy": 0.2968677282333374, + "epoch": 3.639473684210526, + "grad_norm": 0.01233405340462923, + "learning_rate": 1e-06, + "loss": 0.1708, + "step": 1383 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2959662228822708, + "epoch": 3.6421052631578945, + "grad_norm": 0.6275126934051514, + "learning_rate": 1e-06, + "loss": 0.1565, + "step": 1384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.126953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12655.0, + "completions/mean_length": 2703.05078125, + "completions/mean_terminated_length": 713.6510009765625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.3054865449666977, + "epoch": 3.6447368421052633, + "frac_reward_zero_std": 0.03125, + "grad_norm": 223.1032257080078, + "learning_rate": 1e-06, + "loss": 0.1583, + "num_tokens": 462803986.0, + "reward": 0.7047585844993591, + "reward_std": 0.28779661655426025, + "rewards/progression_diversity/mean": -0.046601302921772, + "rewards/progression_diversity/std": 0.12478232383728027, + "rewards/symbolic_reward_accuracy/mean": 0.771484375, + "rewards/symbolic_reward_accuracy/std": 0.4202871024608612, + "rewards/symbolic_reward_partial_score/mean": 0.8448892831802368, + "rewards/symbolic_reward_partial_score/std": 0.32900720834732056, + "rewards/tag_count_reward/mean": -0.111328125, + "rewards/tag_count_reward/std": 0.31484565138816833, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0184571743011475, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 316.0, + "sampling/sampling_logp_difference/mean": 7.270891189575195, + "step": 1385 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.29785533249378204, + "epoch": 3.6473684210526316, + "grad_norm": 0.015050256624817848, + "learning_rate": 1e-06, + "loss": 0.229, + "step": 1386 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3828125, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.2764730453491211, + "epoch": 3.65, + "grad_norm": 0.25503814220428467, + "learning_rate": 1e-06, + "loss": 0.2498, + "step": 1387 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.290697917342186, + "epoch": 3.6526315789473687, + "grad_norm": 4.086553573608398, + "learning_rate": 1e-06, + "loss": 0.2406, + "step": 1388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3594.0, + "completions/mean_length": 2599.453125, + "completions/mean_terminated_length": 630.232177734375, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.3037455379962921, + "epoch": 3.655263157894737, + "frac_reward_zero_std": 0.0625, + "grad_norm": 311.0882873535156, + "learning_rate": 1e-06, + "loss": 0.1859, + "num_tokens": 464557082.0, + "reward": 0.7203487157821655, + "reward_std": 0.24300315976142883, + "rewards/progression_diversity/mean": -0.04520959407091141, + "rewards/progression_diversity/std": 0.12297887355089188, + "rewards/symbolic_reward_accuracy/mean": 0.7890625, + "rewards/symbolic_reward_accuracy/std": 0.4083731174468994, + "rewards/symbolic_reward_partial_score/mean": 0.8623046875, + "rewards/symbolic_reward_partial_score/std": 0.31785058975219727, + "rewards/tag_count_reward/mean": -0.11328125, + "rewards/tag_count_reward/std": 0.3172462284564972, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0264785289764404, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 318.0, + "sampling/sampling_logp_difference/mean": 6.508614540100098, + "step": 1389 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4453125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5625, + "entropy": 0.30377230048179626, + "epoch": 3.6578947368421053, + "grad_norm": 0.015176265500485897, + "learning_rate": 1e-06, + "loss": 0.1212, + "step": 1390 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.421875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.578125, + "entropy": 0.3056159168481827, + "epoch": 3.6605263157894736, + "grad_norm": 0.1255595088005066, + "learning_rate": 1e-06, + "loss": 0.2117, + "step": 1391 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3828125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.546875, + "entropy": 0.29334986209869385, + "epoch": 3.663157894736842, + "grad_norm": 0.023698071017861366, + "learning_rate": 1e-06, + "loss": 0.2919, + "step": 1392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2280.0, + "completions/mean_length": 1780.5703125, + "completions/mean_terminated_length": 609.8311767578125, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.32029321789741516, + "epoch": 3.6657894736842103, + "frac_reward_zero_std": 0.375, + "grad_norm": 217.3709259033203, + "learning_rate": 1e-06, + "loss": 0.1288, + "num_tokens": 465858846.0, + "reward": 0.7777209281921387, + "reward_std": 0.20324808359146118, + "rewards/progression_diversity/mean": -0.025765735656023026, + "rewards/progression_diversity/std": 0.09416744112968445, + "rewards/symbolic_reward_accuracy/mean": 0.853515625, + "rewards/symbolic_reward_accuracy/std": 0.35393697023391724, + "rewards/symbolic_reward_partial_score/mean": 0.90771484375, + "rewards/symbolic_reward_partial_score/std": 0.2606476843357086, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0482451915740967, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 320.0, + "sampling/sampling_logp_difference/mean": 4.223204612731934, + "step": 1393 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.31482554972171783, + "epoch": 3.668421052631579, + "grad_norm": 2.6809914112091064, + "learning_rate": 1e-06, + "loss": 0.0911, + "step": 1394 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3331180214881897, + "epoch": 3.6710526315789473, + "grad_norm": 0.01906052976846695, + "learning_rate": 1e-06, + "loss": 0.1226, + "step": 1395 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.32584846019744873, + "epoch": 3.6736842105263157, + "grad_norm": 60.955745697021484, + "learning_rate": 1e-06, + "loss": 0.1089, + "step": 1396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2274.0, + "completions/mean_length": 2306.021484375, + "completions/mean_terminated_length": 646.1724853515625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.3163197785615921, + "epoch": 3.6763157894736844, + "frac_reward_zero_std": 0.28125, + "grad_norm": 166.03375244140625, + "learning_rate": 1e-06, + "loss": 0.1648, + "num_tokens": 467440009.0, + "reward": 0.7467856407165527, + "reward_std": 0.22565452754497528, + "rewards/progression_diversity/mean": -0.038237735629081726, + "rewards/progression_diversity/std": 0.11381914466619492, + "rewards/symbolic_reward_accuracy/mean": 0.8203125, + "rewards/symbolic_reward_accuracy/std": 0.38430243730545044, + "rewards/symbolic_reward_partial_score/mean": 0.8811848759651184, + "rewards/symbolic_reward_partial_score/std": 0.2920258939266205, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.042597770690918, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 320.0, + "sampling/sampling_logp_difference/mean": 5.3379316329956055, + "step": 1397 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.31748393177986145, + "epoch": 3.6789473684210527, + "grad_norm": 256.6326599121094, + "learning_rate": 1e-06, + "loss": 0.1745, + "step": 1398 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.30522364377975464, + "epoch": 3.681578947368421, + "grad_norm": 2.4348554611206055, + "learning_rate": 1e-06, + "loss": 0.1645, + "step": 1399 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.30944982171058655, + "epoch": 3.6842105263157894, + "grad_norm": 3.376948356628418, + "learning_rate": 1e-06, + "loss": 0.1815, + "step": 1400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2336.0, + "completions/mean_length": 1631.30078125, + "completions/mean_terminated_length": 581.9456176757812, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.3202086091041565, + "epoch": 3.6868421052631577, + "frac_reward_zero_std": 0.40625, + "grad_norm": 520.1287841796875, + "learning_rate": 1e-06, + "loss": 0.1837, + "num_tokens": 468665507.0, + "reward": 0.823440432548523, + "reward_std": 0.1710093915462494, + "rewards/progression_diversity/mean": -0.024127831682562828, + "rewards/progression_diversity/std": 0.09204965829849243, + "rewards/symbolic_reward_accuracy/mean": 0.916015625, + "rewards/symbolic_reward_accuracy/std": 0.2776356339454651, + "rewards/symbolic_reward_partial_score/mean": 0.93310546875, + "rewards/symbolic_reward_partial_score/std": 0.2400180548429489, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0514576435089111, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 322.0, + "sampling/sampling_logp_difference/mean": 4.249440670013428, + "step": 1401 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.33165352046489716, + "epoch": 3.6894736842105265, + "grad_norm": 0.17460408806800842, + "learning_rate": 1e-06, + "loss": 0.1594, + "step": 1402 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3367105722427368, + "epoch": 3.692105263157895, + "grad_norm": 0.006330076605081558, + "learning_rate": 1e-06, + "loss": 0.1336, + "step": 1403 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.3383978605270386, + "epoch": 3.694736842105263, + "grad_norm": 0.006822290364652872, + "learning_rate": 1e-06, + "loss": 0.0894, + "step": 1404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.111328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 2382.189453125, + "completions/mean_terminated_length": 628.1165161132812, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.30853356420993805, + "epoch": 3.6973684210526314, + "frac_reward_zero_std": 0.3125, + "grad_norm": 92.30716705322266, + "learning_rate": 1e-06, + "loss": 0.0998, + "num_tokens": 470292484.0, + "reward": 0.7492548823356628, + "reward_std": 0.22711637616157532, + "rewards/progression_diversity/mean": -0.04033348336815834, + "rewards/progression_diversity/std": 0.11694171279668808, + "rewards/symbolic_reward_accuracy/mean": 0.828125, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.8758138418197632, + "rewards/symbolic_reward_partial_score/std": 0.30300623178482056, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0440869331359863, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 324.0, + "sampling/sampling_logp_difference/mean": 4.415769100189209, + "step": 1405 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3130802512168884, + "epoch": 3.7, + "grad_norm": 4.948612689971924, + "learning_rate": 1e-06, + "loss": 0.1542, + "step": 1406 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2954626977443695, + "epoch": 3.7026315789473685, + "grad_norm": 214.0167694091797, + "learning_rate": 1e-06, + "loss": 0.2376, + "step": 1407 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.29514195024967194, + "epoch": 3.705263157894737, + "grad_norm": 0.019930332899093628, + "learning_rate": 1e-06, + "loss": 0.1884, + "step": 1408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2248.0, + "completions/mean_length": 1681.869140625, + "completions/mean_terminated_length": 569.9432983398438, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.3324284106492996, + "epoch": 3.707894736842105, + "frac_reward_zero_std": 0.21875, + "grad_norm": 933.0283203125, + "learning_rate": 1e-06, + "loss": 0.1402, + "num_tokens": 471541729.0, + "reward": 0.8024474382400513, + "reward_std": 0.21313899755477905, + "rewards/progression_diversity/mean": -0.028701424598693848, + "rewards/progression_diversity/std": 0.10557378083467484, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.9270833134651184, + "rewards/symbolic_reward_partial_score/std": 0.2357022613286972, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0388381481170654, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 324.0, + "sampling/sampling_logp_difference/mean": 4.987360954284668, + "step": 1409 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3176589757204056, + "epoch": 3.7105263157894735, + "grad_norm": 0.014979632571339607, + "learning_rate": 1e-06, + "loss": 0.174, + "step": 1410 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.32269425690174103, + "epoch": 3.713157894736842, + "grad_norm": 0.00700529245659709, + "learning_rate": 1e-06, + "loss": 0.1423, + "step": 1411 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3213765025138855, + "epoch": 3.7157894736842105, + "grad_norm": 0.2787642180919647, + "learning_rate": 1e-06, + "loss": 0.1043, + "step": 1412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3836.0, + "completions/mean_length": 2176.283203125, + "completions/mean_terminated_length": 638.6514892578125, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.31092438101768494, + "epoch": 3.718421052631579, + "frac_reward_zero_std": 0.1875, + "grad_norm": 331.7125244140625, + "learning_rate": 1e-06, + "loss": 0.171, + "num_tokens": 473063122.0, + "reward": 0.7549986839294434, + "reward_std": 0.18977715075016022, + "rewards/progression_diversity/mean": -0.037239447236061096, + "rewards/progression_diversity/std": 0.11695530265569687, + "rewards/symbolic_reward_accuracy/mean": 0.826171875, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.8948567509651184, + "rewards/symbolic_reward_partial_score/std": 0.2710212469100952, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0411852598190308, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 326.0, + "sampling/sampling_logp_difference/mean": 4.672489166259766, + "step": 1413 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.29592007398605347, + "epoch": 3.7210526315789476, + "grad_norm": 225.3509979248047, + "learning_rate": 1e-06, + "loss": 0.211, + "step": 1414 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4140625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.515625, + "entropy": 0.3040802776813507, + "epoch": 3.723684210526316, + "grad_norm": 0.005704566836357117, + "learning_rate": 1e-06, + "loss": 0.1706, + "step": 1415 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.4140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.484375, + "entropy": 0.33103369176387787, + "epoch": 3.7263157894736842, + "grad_norm": 0.011854211799800396, + "learning_rate": 1e-06, + "loss": 0.0719, + "step": 1416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2048.0, + "completions/mean_length": 1917.099609375, + "completions/mean_terminated_length": 590.7100219726562, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.295977458357811, + "epoch": 3.7289473684210526, + "frac_reward_zero_std": 0.21875, + "grad_norm": 196.44320678710938, + "learning_rate": 1e-06, + "loss": 0.2175, + "num_tokens": 474441989.0, + "reward": 0.7707229852676392, + "reward_std": 0.23186513781547546, + "rewards/progression_diversity/mean": -0.03708141669631004, + "rewards/progression_diversity/std": 0.12436394393444061, + "rewards/symbolic_reward_accuracy/mean": 0.84765625, + "rewards/symbolic_reward_accuracy/std": 0.35970520973205566, + "rewards/symbolic_reward_partial_score/mean": 0.8990885019302368, + "rewards/symbolic_reward_partial_score/std": 0.27446648478507996, + "rewards/tag_count_reward/mean": -0.072265625, + "rewards/tag_count_reward/std": 0.2591804563999176, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0268856287002563, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 328.0, + "sampling/sampling_logp_difference/mean": 5.979172229766846, + "step": 1417 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.31002572178840637, + "epoch": 3.731578947368421, + "grad_norm": 1676.92724609375, + "learning_rate": 1e-06, + "loss": 0.4721, + "step": 1418 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3273525983095169, + "epoch": 3.734210526315789, + "grad_norm": 0.009710345417261124, + "learning_rate": 1e-06, + "loss": 0.0811, + "step": 1419 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.3278929442167282, + "epoch": 3.736842105263158, + "grad_norm": 0.014435559511184692, + "learning_rate": 1e-06, + "loss": 0.145, + "step": 1420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.11328125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2835.0, + "completions/mean_length": 2402.435546875, + "completions/mean_terminated_length": 616.2444458007812, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.308517649769783, + "epoch": 3.7394736842105263, + "frac_reward_zero_std": 0.1875, + "grad_norm": 69.96543884277344, + "learning_rate": 1e-06, + "loss": 0.1305, + "num_tokens": 476068004.0, + "reward": 0.755204439163208, + "reward_std": 0.24294275045394897, + "rewards/progression_diversity/mean": -0.050845738500356674, + "rewards/progression_diversity/std": 0.14557471871376038, + "rewards/symbolic_reward_accuracy/mean": 0.8359375, + "rewards/symbolic_reward_accuracy/std": 0.37069445848464966, + "rewards/symbolic_reward_partial_score/mean": 0.88037109375, + "rewards/symbolic_reward_partial_score/std": 0.296750545501709, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0160291194915771, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 328.0, + "sampling/sampling_logp_difference/mean": 7.112195014953613, + "step": 1421 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.29520660638809204, + "epoch": 3.7421052631578946, + "grad_norm": 1305.361083984375, + "learning_rate": 1e-06, + "loss": 0.298, + "step": 1422 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2944742292165756, + "epoch": 3.7447368421052634, + "grad_norm": 1.5552692413330078, + "learning_rate": 1e-06, + "loss": 0.1569, + "step": 1423 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2920891344547272, + "epoch": 3.7473684210526317, + "grad_norm": 0.0419452041387558, + "learning_rate": 1e-06, + "loss": 0.2029, + "step": 1424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2503.0, + "completions/mean_length": 2013.22265625, + "completions/mean_terminated_length": 594.6480712890625, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.30936600267887115, + "epoch": 3.75, + "frac_reward_zero_std": 0.4375, + "grad_norm": 129.8343048095703, + "learning_rate": 1e-06, + "loss": 0.0752, + "num_tokens": 477496566.0, + "reward": 0.7965672016143799, + "reward_std": 0.15301448106765747, + "rewards/progression_diversity/mean": -0.03566431626677513, + "rewards/progression_diversity/std": 0.11889361590147018, + "rewards/symbolic_reward_accuracy/mean": 0.87890625, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.9246419072151184, + "rewards/symbolic_reward_partial_score/std": 0.23886118829250336, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0362024307250977, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 330.0, + "sampling/sampling_logp_difference/mean": 4.878294467926025, + "step": 1425 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.32300904393196106, + "epoch": 3.7526315789473683, + "grad_norm": 0.02335791103541851, + "learning_rate": 1e-06, + "loss": 0.1367, + "step": 1426 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31392285227775574, + "epoch": 3.7552631578947366, + "grad_norm": 0.0137960035353899, + "learning_rate": 1e-06, + "loss": 0.1808, + "step": 1427 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3122747987508774, + "epoch": 3.7578947368421054, + "grad_norm": 0.0074070231057703495, + "learning_rate": 1e-06, + "loss": 0.1176, + "step": 1428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.103515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 11201.0, + "completions/mean_length": 2229.384765625, + "completions/mean_terminated_length": 594.9738159179688, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.3264199197292328, + "epoch": 3.7605263157894737, + "frac_reward_zero_std": 0.3125, + "grad_norm": 201.52215576171875, + "learning_rate": 1e-06, + "loss": 0.0746, + "num_tokens": 479012763.0, + "reward": 0.7792283296585083, + "reward_std": 0.16983628273010254, + "rewards/progression_diversity/mean": -0.041039757430553436, + "rewards/progression_diversity/std": 0.12305484712123871, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.91064453125, + "rewards/symbolic_reward_partial_score/std": 0.25674015283584595, + "rewards/tag_count_reward/mean": -0.091796875, + "rewards/tag_count_reward/std": 0.289021372795105, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0295979976654053, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 332.0, + "sampling/sampling_logp_difference/mean": 6.160717964172363, + "step": 1429 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.30331704020500183, + "epoch": 3.763157894736842, + "grad_norm": 0.012537804432213306, + "learning_rate": 1e-06, + "loss": 0.1338, + "step": 1430 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3093401789665222, + "epoch": 3.765789473684211, + "grad_norm": 0.014041773974895477, + "learning_rate": 1e-06, + "loss": 0.1339, + "step": 1431 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.29702627658843994, + "epoch": 3.768421052631579, + "grad_norm": 0.010763847269117832, + "learning_rate": 1e-06, + "loss": 0.2351, + "step": 1432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 2045.2734375, + "completions/mean_terminated_length": 629.8626708984375, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.32291169464588165, + "epoch": 3.7710526315789474, + "frac_reward_zero_std": 0.21875, + "grad_norm": 403.7364807128906, + "learning_rate": 1e-06, + "loss": 0.1425, + "num_tokens": 480464935.0, + "reward": 0.7736941576004028, + "reward_std": 0.2378438413143158, + "rewards/progression_diversity/mean": -0.03292561694979668, + "rewards/progression_diversity/std": 0.10725618153810501, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.9016926884651184, + "rewards/symbolic_reward_partial_score/std": 0.265513151884079, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.034686803817749, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 332.0, + "sampling/sampling_logp_difference/mean": 5.4915852546691895, + "step": 1433 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.3211905360221863, + "epoch": 3.7736842105263158, + "grad_norm": 0.015537317842245102, + "learning_rate": 1e-06, + "loss": 0.1144, + "step": 1434 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.33275802433490753, + "epoch": 3.776315789473684, + "grad_norm": 0.014047092758119106, + "learning_rate": 1e-06, + "loss": 0.117, + "step": 1435 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.3044770658016205, + "epoch": 3.7789473684210524, + "grad_norm": 0.010970460250973701, + "learning_rate": 1e-06, + "loss": 0.1902, + "step": 1436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.15234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12968.0, + "completions/mean_length": 3075.37890625, + "completions/mean_terminated_length": 683.5068969726562, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.2942586690187454, + "epoch": 3.781578947368421, + "frac_reward_zero_std": 0.125, + "grad_norm": 349.7894287109375, + "learning_rate": 1e-06, + "loss": 0.1484, + "num_tokens": 482467017.0, + "reward": 0.6715598702430725, + "reward_std": 0.2547609806060791, + "rewards/progression_diversity/mean": -0.05104883760213852, + "rewards/progression_diversity/std": 0.12435862421989441, + "rewards/symbolic_reward_accuracy/mean": 0.732421875, + "rewards/symbolic_reward_accuracy/std": 0.4431293308734894, + "rewards/symbolic_reward_partial_score/mean": 0.8229166269302368, + "rewards/symbolic_reward_partial_score/std": 0.34593749046325684, + "rewards/tag_count_reward/mean": -0.142578125, + "rewards/tag_count_reward/std": 0.3499840497970581, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0357654094696045, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 332.0, + "sampling/sampling_logp_difference/mean": 5.477702617645264, + "step": 1437 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3671875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.27393701672554016, + "epoch": 3.7842105263157895, + "grad_norm": 0.00826684758067131, + "learning_rate": 1e-06, + "loss": 0.2608, + "step": 1438 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.390625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5546875, + "entropy": 0.3037470281124115, + "epoch": 3.786842105263158, + "grad_norm": 0.006643475033342838, + "learning_rate": 1e-06, + "loss": 0.1667, + "step": 1439 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5234375, + "entropy": 0.28216809034347534, + "epoch": 3.7894736842105265, + "grad_norm": 0.01340513676404953, + "learning_rate": 1e-06, + "loss": 0.2354, + "step": 1440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3061.0, + "completions/mean_length": 1698.798828125, + "completions/mean_terminated_length": 588.1533813476562, + "completions/min_length": 237.0, + "completions/min_terminated_length": 237.0, + "entropy": 0.33806556463241577, + "epoch": 3.792105263157895, + "frac_reward_zero_std": 0.34375, + "grad_norm": 164.4355926513672, + "learning_rate": 1e-06, + "loss": 0.0763, + "num_tokens": 483743938.0, + "reward": 0.780125617980957, + "reward_std": 0.18881092965602875, + "rewards/progression_diversity/mean": -0.02455180510878563, + "rewards/progression_diversity/std": 0.09061013907194138, + "rewards/symbolic_reward_accuracy/mean": 0.849609375, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.9208984375, + "rewards/symbolic_reward_partial_score/std": 0.2368720918893814, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0537469387054443, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 332.0, + "sampling/sampling_logp_difference/mean": 3.5584030151367188, + "step": 1441 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.31358209252357483, + "epoch": 3.794736842105263, + "grad_norm": 0.010208331979811192, + "learning_rate": 1e-06, + "loss": 0.1884, + "step": 1442 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3256896734237671, + "epoch": 3.7973684210526315, + "grad_norm": 0.05766785889863968, + "learning_rate": 1e-06, + "loss": 0.0753, + "step": 1443 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.32447347044944763, + "epoch": 3.8, + "grad_norm": 0.014500455930829048, + "learning_rate": 1e-06, + "loss": 0.1182, + "step": 1444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2773.0, + "completions/mean_length": 2110.767578125, + "completions/mean_terminated_length": 566.0454711914062, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.30462731420993805, + "epoch": 3.8026315789473686, + "frac_reward_zero_std": 0.21875, + "grad_norm": 307.380615234375, + "learning_rate": 1e-06, + "loss": 0.1169, + "num_tokens": 485208267.0, + "reward": 0.7762401700019836, + "reward_std": 0.18577754497528076, + "rewards/progression_diversity/mean": -0.032237473875284195, + "rewards/progression_diversity/std": 0.10042642056941986, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.9088541269302368, + "rewards/symbolic_reward_partial_score/std": 0.2546987235546112, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.053719401359558, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 336.0, + "sampling/sampling_logp_difference/mean": 3.9605154991149902, + "step": 1445 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3047986924648285, + "epoch": 3.805263157894737, + "grad_norm": 0.018491854891180992, + "learning_rate": 1e-06, + "loss": 0.1968, + "step": 1446 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3292408138513565, + "epoch": 3.807894736842105, + "grad_norm": 2.424530267715454, + "learning_rate": 1e-06, + "loss": 0.0355, + "step": 1447 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2958214730024338, + "epoch": 3.8105263157894735, + "grad_norm": 0.017129186540842056, + "learning_rate": 1e-06, + "loss": 0.2069, + "step": 1448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.1015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 16038.0, + "completions/mean_length": 2222.494140625, + "completions/mean_terminated_length": 621.6282348632812, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.3045518100261688, + "epoch": 3.8131578947368423, + "frac_reward_zero_std": 0.25, + "grad_norm": 98.93772888183594, + "learning_rate": 1e-06, + "loss": 0.1575, + "num_tokens": 486730280.0, + "reward": 0.7752275466918945, + "reward_std": 0.1814824491739273, + "rewards/progression_diversity/mean": -0.03583987057209015, + "rewards/progression_diversity/std": 0.10934660583734512, + "rewards/symbolic_reward_accuracy/mean": 0.857421875, + "rewards/symbolic_reward_accuracy/std": 0.3499840497970581, + "rewards/symbolic_reward_partial_score/mean": 0.8990885615348816, + "rewards/symbolic_reward_partial_score/std": 0.27117884159088135, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0510573387145996, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 336.0, + "sampling/sampling_logp_difference/mean": 4.26395320892334, + "step": 1449 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.31291763484477997, + "epoch": 3.8157894736842106, + "grad_norm": 20.84035301208496, + "learning_rate": 1e-06, + "loss": 0.1215, + "step": 1450 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.30178968608379364, + "epoch": 3.818421052631579, + "grad_norm": 0.08378605544567108, + "learning_rate": 1e-06, + "loss": 0.1589, + "step": 1451 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.30265994369983673, + "epoch": 3.8210526315789473, + "grad_norm": 0.11986826360225677, + "learning_rate": 1e-06, + "loss": 0.1607, + "step": 1452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 10509.0, + "completions/mean_length": 2041.810546875, + "completions/mean_terminated_length": 592.169921875, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.3079294115304947, + "epoch": 3.8236842105263156, + "frac_reward_zero_std": 0.21875, + "grad_norm": 155.3240509033203, + "learning_rate": 1e-06, + "loss": 0.1091, + "num_tokens": 488187015.0, + "reward": 0.7332264184951782, + "reward_std": 0.21755467355251312, + "rewards/progression_diversity/mean": -0.03185119852423668, + "rewards/progression_diversity/std": 0.10352633893489838, + "rewards/symbolic_reward_accuracy/mean": 0.791015625, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.8904622197151184, + "rewards/symbolic_reward_partial_score/std": 0.26156580448150635, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0546609163284302, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 336.0, + "sampling/sampling_logp_difference/mean": 4.185214042663574, + "step": 1453 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3079414367675781, + "epoch": 3.8263157894736843, + "grad_norm": 0.01617673970758915, + "learning_rate": 1e-06, + "loss": 0.1361, + "step": 1454 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4921875, + "entropy": 0.30259697139263153, + "epoch": 3.8289473684210527, + "grad_norm": 0.010684851557016373, + "learning_rate": 1e-06, + "loss": 0.1832, + "step": 1455 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.29774773120880127, + "epoch": 3.831578947368421, + "grad_norm": 0.026969242841005325, + "learning_rate": 1e-06, + "loss": 0.1849, + "step": 1456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15120.0, + "completions/mean_length": 2334.595703125, + "completions/mean_terminated_length": 609.2302856445312, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.28709380328655243, + "epoch": 3.8342105263157897, + "frac_reward_zero_std": 0.1875, + "grad_norm": 261.2757568359375, + "learning_rate": 1e-06, + "loss": 0.273, + "num_tokens": 489779800.0, + "reward": 0.741887092590332, + "reward_std": 0.2086125761270523, + "rewards/progression_diversity/mean": -0.039813414216041565, + "rewards/progression_diversity/std": 0.11546120047569275, + "rewards/symbolic_reward_accuracy/mean": 0.814453125, + "rewards/symbolic_reward_accuracy/std": 0.38912075757980347, + "rewards/symbolic_reward_partial_score/mean": 0.8785807490348816, + "rewards/symbolic_reward_partial_score/std": 0.2908572554588318, + "rewards/tag_count_reward/mean": -0.099609375, + "rewards/tag_count_reward/std": 0.29977133870124817, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0556657314300537, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 340.0, + "sampling/sampling_logp_difference/mean": 5.175328731536865, + "step": 1457 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3067042976617813, + "epoch": 3.836842105263158, + "grad_norm": 0.011344866827130318, + "learning_rate": 1e-06, + "loss": 0.1358, + "step": 1458 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4140625, + "entropy": 0.3160387873649597, + "epoch": 3.8394736842105264, + "grad_norm": 2.8747458457946777, + "learning_rate": 1e-06, + "loss": 0.0778, + "step": 1459 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.1875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.46875, + "entropy": 0.2950311750173569, + "epoch": 3.8421052631578947, + "grad_norm": 0.005016846116632223, + "learning_rate": 1e-06, + "loss": 0.1839, + "step": 1460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.08203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15092.0, + "completions/mean_length": 1931.142578125, + "completions/mean_terminated_length": 639.610595703125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.31487199664115906, + "epoch": 3.844736842105263, + "frac_reward_zero_std": 0.21875, + "grad_norm": 225.66839599609375, + "learning_rate": 1e-06, + "loss": 0.1183, + "num_tokens": 491169345.0, + "reward": 0.7955160140991211, + "reward_std": 0.22690117359161377, + "rewards/progression_diversity/mean": -0.033369071781635284, + "rewards/progression_diversity/std": 0.11018730700016022, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9132486581802368, + "rewards/symbolic_reward_partial_score/std": 0.2617155611515045, + "rewards/tag_count_reward/mean": -0.078125, + "rewards/tag_count_reward/std": 0.26863065361976624, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0568856000900269, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 340.0, + "sampling/sampling_logp_difference/mean": 4.69308614730835, + "step": 1461 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.31012292206287384, + "epoch": 3.8473684210526313, + "grad_norm": 0.009823931381106377, + "learning_rate": 1e-06, + "loss": 0.1637, + "step": 1462 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4375, + "entropy": 0.2923455536365509, + "epoch": 3.85, + "grad_norm": 0.34986069798469543, + "learning_rate": 1e-06, + "loss": 0.2035, + "step": 1463 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.3000039905309677, + "epoch": 3.8526315789473684, + "grad_norm": 0.01767495833337307, + "learning_rate": 1e-06, + "loss": 0.1856, + "step": 1464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9351.0, + "completions/mean_length": 1834.5859375, + "completions/mean_terminated_length": 634.9513549804688, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.3126460313796997, + "epoch": 3.8552631578947367, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1094.819580078125, + "learning_rate": 1e-06, + "loss": 0.1071, + "num_tokens": 492513805.0, + "reward": 0.763613224029541, + "reward_std": 0.17882443964481354, + "rewards/progression_diversity/mean": -0.03028068132698536, + "rewards/progression_diversity/std": 0.10648433864116669, + "rewards/symbolic_reward_accuracy/mean": 0.828125, + "rewards/symbolic_reward_accuracy/std": 0.3776407241821289, + "rewards/symbolic_reward_partial_score/mean": 0.9070637822151184, + "rewards/symbolic_reward_partial_score/std": 0.24986954033374786, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0567845106124878, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 340.0, + "sampling/sampling_logp_difference/mean": 4.487025260925293, + "step": 1465 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2963457703590393, + "epoch": 3.8578947368421055, + "grad_norm": 551.8005981445312, + "learning_rate": 1e-06, + "loss": 0.2159, + "step": 1466 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.30269500613212585, + "epoch": 3.860526315789474, + "grad_norm": 30.8673152923584, + "learning_rate": 1e-06, + "loss": 0.1849, + "step": 1467 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.31897950172424316, + "epoch": 3.863157894736842, + "grad_norm": 0.007765746209770441, + "learning_rate": 1e-06, + "loss": 0.0966, + "step": 1468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.09375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 14826.0, + "completions/mean_length": 2125.513671875, + "completions/mean_terminated_length": 650.4978637695312, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.2966301888227463, + "epoch": 3.8657894736842104, + "frac_reward_zero_std": 0.15625, + "grad_norm": 295.4943542480469, + "learning_rate": 1e-06, + "loss": 0.2116, + "num_tokens": 493999860.0, + "reward": 0.7747255563735962, + "reward_std": 0.2700809836387634, + "rewards/progression_diversity/mean": -0.04209459200501442, + "rewards/progression_diversity/std": 0.12887020409107208, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.89697265625, + "rewards/symbolic_reward_partial_score/std": 0.2792568802833557, + "rewards/tag_count_reward/mean": -0.095703125, + "rewards/tag_count_reward/std": 0.2944713830947876, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0543975830078125, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 342.0, + "sampling/sampling_logp_difference/mean": 4.8566389083862305, + "step": 1469 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2987577021121979, + "epoch": 3.8684210526315788, + "grad_norm": 2.0050089359283447, + "learning_rate": 1e-06, + "loss": 0.125, + "step": 1470 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.31162601709365845, + "epoch": 3.8710526315789475, + "grad_norm": 0.4049341678619385, + "learning_rate": 1e-06, + "loss": 0.0939, + "step": 1471 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4453125, + "entropy": 0.27791038155555725, + "epoch": 3.873684210526316, + "grad_norm": 2.2584075927734375, + "learning_rate": 1e-06, + "loss": 0.2615, + "step": 1472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.126953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3494.0, + "completions/mean_length": 2676.2734375, + "completions/mean_terminated_length": 682.9798583984375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.2749413251876831, + "epoch": 3.876315789473684, + "frac_reward_zero_std": 0.21875, + "grad_norm": 352.43115234375, + "learning_rate": 1e-06, + "loss": 0.1901, + "num_tokens": 495803296.0, + "reward": 0.6940611600875854, + "reward_std": 0.22690893709659576, + "rewards/progression_diversity/mean": -0.047005537897348404, + "rewards/progression_diversity/std": 0.13115794956684113, + "rewards/symbolic_reward_accuracy/mean": 0.75390625, + "rewards/symbolic_reward_accuracy/std": 0.4311550557613373, + "rewards/symbolic_reward_partial_score/mean": 0.845703125, + "rewards/symbolic_reward_partial_score/std": 0.31764960289001465, + "rewards/tag_count_reward/mean": -0.115234375, + "rewards/tag_count_reward/std": 0.3196168541908264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0539484024047852, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 342.0, + "sampling/sampling_logp_difference/mean": 4.197028636932373, + "step": 1473 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.2936839610338211, + "epoch": 3.8789473684210525, + "grad_norm": 0.012438047677278519, + "learning_rate": 1e-06, + "loss": 0.1267, + "step": 1474 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3515625, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4765625, + "entropy": 0.3024512231349945, + "epoch": 3.8815789473684212, + "grad_norm": 0.14518336951732635, + "learning_rate": 1e-06, + "loss": 0.1382, + "step": 1475 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.5, + "entropy": 0.2862910181283951, + "epoch": 3.8842105263157896, + "grad_norm": 0.021601000800728798, + "learning_rate": 1e-06, + "loss": 0.2073, + "step": 1476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.072265625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5602.0, + "completions/mean_length": 1790.48828125, + "completions/mean_terminated_length": 653.73046875, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.2987203449010849, + "epoch": 3.886842105263158, + "frac_reward_zero_std": 0.28125, + "grad_norm": 451.8078308105469, + "learning_rate": 1e-06, + "loss": 0.2304, + "num_tokens": 497139354.0, + "reward": 0.7696309089660645, + "reward_std": 0.18139323592185974, + "rewards/progression_diversity/mean": -0.024213135242462158, + "rewards/progression_diversity/std": 0.09224303066730499, + "rewards/symbolic_reward_accuracy/mean": 0.845703125, + "rewards/symbolic_reward_accuracy/std": 0.36158639192581177, + "rewards/symbolic_reward_partial_score/mean": 0.8963216543197632, + "rewards/symbolic_reward_partial_score/std": 0.2699549198150635, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.065273404121399, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 340.0, + "sampling/sampling_logp_difference/mean": 2.8104472160339355, + "step": 1477 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3077242374420166, + "epoch": 3.889473684210526, + "grad_norm": 75.5800552368164, + "learning_rate": 1e-06, + "loss": 0.1149, + "step": 1478 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.32739219069480896, + "epoch": 3.8921052631578945, + "grad_norm": 0.022421207278966904, + "learning_rate": 1e-06, + "loss": 0.0204, + "step": 1479 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4609375, + "entropy": 0.3064194321632385, + "epoch": 3.8947368421052633, + "grad_norm": 0.005432396661490202, + "learning_rate": 1e-06, + "loss": 0.1348, + "step": 1480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2170.0, + "completions/mean_length": 2075.45703125, + "completions/mean_terminated_length": 629.2172241210938, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.29528728127479553, + "epoch": 3.8973684210526316, + "frac_reward_zero_std": 0.34375, + "grad_norm": 82.80768585205078, + "learning_rate": 1e-06, + "loss": 0.0844, + "num_tokens": 498619812.0, + "reward": 0.7649828791618347, + "reward_std": 0.20222607254981995, + "rewards/progression_diversity/mean": -0.034921687096357346, + "rewards/progression_diversity/std": 0.11383773386478424, + "rewards/symbolic_reward_accuracy/mean": 0.837890625, + "rewards/symbolic_reward_accuracy/std": 0.3689115643501282, + "rewards/symbolic_reward_partial_score/mean": 0.9026692509651184, + "rewards/symbolic_reward_partial_score/std": 0.26592451333999634, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0633716583251953, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 342.0, + "sampling/sampling_logp_difference/mean": 3.5869860649108887, + "step": 1481 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.3259361535310745, + "epoch": 3.9, + "grad_norm": 0.030284911394119263, + "learning_rate": 1e-06, + "loss": 0.0843, + "step": 1482 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.2944328784942627, + "epoch": 3.9026315789473687, + "grad_norm": 0.011786861345171928, + "learning_rate": 1e-06, + "loss": 0.2573, + "step": 1483 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.30443601310253143, + "epoch": 3.905263157894737, + "grad_norm": 0.02485356479883194, + "learning_rate": 1e-06, + "loss": 0.1844, + "step": 1484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2390.0, + "completions/mean_length": 1519.84375, + "completions/mean_terminated_length": 594.6888427734375, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.32855215668678284, + "epoch": 3.9078947368421053, + "frac_reward_zero_std": 0.3125, + "grad_norm": 51.70481491088867, + "learning_rate": 1e-06, + "loss": 0.1191, + "num_tokens": 499803124.0, + "reward": 0.8123822212219238, + "reward_std": 0.18835322558879852, + "rewards/progression_diversity/mean": -0.021549619734287262, + "rewards/progression_diversity/std": 0.08892330527305603, + "rewards/symbolic_reward_accuracy/mean": 0.8984375, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.9287109375, + "rewards/symbolic_reward_partial_score/std": 0.23190389573574066, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0671436786651611, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 342.0, + "sampling/sampling_logp_difference/mean": 3.182962417602539, + "step": 1485 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.30832575261592865, + "epoch": 3.9105263157894736, + "grad_norm": 0.009262947365641594, + "learning_rate": 1e-06, + "loss": 0.1818, + "step": 1486 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.3187596797943115, + "epoch": 3.913157894736842, + "grad_norm": 0.00602568918839097, + "learning_rate": 1e-06, + "loss": 0.1425, + "step": 1487 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3167625367641449, + "epoch": 3.9157894736842103, + "grad_norm": 0.01127164252102375, + "learning_rate": 1e-06, + "loss": 0.1257, + "step": 1488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2527.0, + "completions/mean_length": 1434.26953125, + "completions/mean_terminated_length": 569.4090576171875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.3199300616979599, + "epoch": 3.918421052631579, + "frac_reward_zero_std": 0.3125, + "grad_norm": 47.4603157043457, + "learning_rate": 1e-06, + "loss": 0.1406, + "num_tokens": 500914430.0, + "reward": 0.8060013055801392, + "reward_std": 0.2019416242837906, + "rewards/progression_diversity/mean": -0.019987143576145172, + "rewards/progression_diversity/std": 0.088285431265831, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9295247793197632, + "rewards/symbolic_reward_partial_score/std": 0.22341255843639374, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0701146125793457, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 344.0, + "sampling/sampling_logp_difference/mean": 2.487426280975342, + "step": 1489 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.32358092069625854, + "epoch": 3.9210526315789473, + "grad_norm": 5.1159563064575195, + "learning_rate": 1e-06, + "loss": 0.074, + "step": 1490 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3156203180551529, + "epoch": 3.9236842105263157, + "grad_norm": 19.548402786254883, + "learning_rate": 1e-06, + "loss": 0.0988, + "step": 1491 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.321828693151474, + "epoch": 3.9263157894736844, + "grad_norm": 95.19469451904297, + "learning_rate": 1e-06, + "loss": 0.1048, + "step": 1492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9638.0, + "completions/mean_length": 1489.11328125, + "completions/mean_terminated_length": 594.8033447265625, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.31046292185783386, + "epoch": 3.9289473684210527, + "frac_reward_zero_std": 0.34375, + "grad_norm": 160.60206604003906, + "learning_rate": 1e-06, + "loss": 0.1243, + "num_tokens": 502064824.0, + "reward": 0.8175588250160217, + "reward_std": 0.18224866688251495, + "rewards/progression_diversity/mean": -0.02145996317267418, + "rewards/progression_diversity/std": 0.08906258642673492, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.9342448115348816, + "rewards/symbolic_reward_partial_score/std": 0.2271050065755844, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0690562725067139, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 2.968571662902832, + "step": 1493 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.33640483021736145, + "epoch": 3.931578947368421, + "grad_norm": 0.021926378831267357, + "learning_rate": 1e-06, + "loss": 0.0626, + "step": 1494 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.29776348173618317, + "epoch": 3.9342105263157894, + "grad_norm": 0.009869229048490524, + "learning_rate": 1e-06, + "loss": 0.1971, + "step": 1495 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.310922309756279, + "epoch": 3.9368421052631577, + "grad_norm": 0.004013043362647295, + "learning_rate": 1e-06, + "loss": 0.0687, + "step": 1496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9479.0, + "completions/mean_length": 1596.375, + "completions/mean_terminated_length": 577.6033325195312, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.32028135657310486, + "epoch": 3.9394736842105265, + "frac_reward_zero_std": 0.375, + "grad_norm": 302.3440856933594, + "learning_rate": 1e-06, + "loss": 0.1131, + "num_tokens": 503281304.0, + "reward": 0.7774491906166077, + "reward_std": 0.17322488129138947, + "rewards/progression_diversity/mean": -0.02363717183470726, + "rewards/progression_diversity/std": 0.09269430488348007, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.91064453125, + "rewards/symbolic_reward_partial_score/std": 0.2503076493740082, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0702153444290161, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 350.0, + "sampling/sampling_logp_difference/mean": 2.403035879135132, + "step": 1497 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.31212155520915985, + "epoch": 3.942105263157895, + "grad_norm": 0.007395219057798386, + "learning_rate": 1e-06, + "loss": 0.108, + "step": 1498 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.30698293447494507, + "epoch": 3.944736842105263, + "grad_norm": 0.010542972013354301, + "learning_rate": 1e-06, + "loss": 0.1506, + "step": 1499 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31951723992824554, + "epoch": 3.9473684210526314, + "grad_norm": 0.01814758963882923, + "learning_rate": 1e-06, + "loss": 0.0871, + "step": 1500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9357.0, + "completions/mean_length": 1486.361328125, + "completions/mean_terminated_length": 624.5144653320312, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.32599151134490967, + "epoch": 3.95, + "frac_reward_zero_std": 0.40625, + "grad_norm": 6.691247940063477, + "learning_rate": 1e-06, + "loss": 0.0441, + "num_tokens": 504458641.0, + "reward": 0.7741117477416992, + "reward_std": 0.17628371715545654, + "rewards/progression_diversity/mean": -0.02046554908156395, + "rewards/progression_diversity/std": 0.08719975501298904, + "rewards/symbolic_reward_accuracy/mean": 0.841796875, + "rewards/symbolic_reward_accuracy/std": 0.36528825759887695, + "rewards/symbolic_reward_partial_score/mean": 0.9143880605697632, + "rewards/symbolic_reward_partial_score/std": 0.23354806005954742, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0702029466629028, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.0351459980010986, + "step": 1501 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.32493776082992554, + "epoch": 3.9526315789473685, + "grad_norm": 1.328794240951538, + "learning_rate": 1e-06, + "loss": 0.0413, + "step": 1502 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3139204680919647, + "epoch": 3.955263157894737, + "grad_norm": 0.007978091016411781, + "learning_rate": 1e-06, + "loss": 0.1326, + "step": 1503 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3053445369005203, + "epoch": 3.957894736842105, + "grad_norm": 1.1158732175827026, + "learning_rate": 1e-06, + "loss": 0.1221, + "step": 1504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2461.0, + "completions/mean_length": 1384.337890625, + "completions/mean_terminated_length": 614.334716796875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.30869564414024353, + "epoch": 3.9605263157894735, + "frac_reward_zero_std": 0.4375, + "grad_norm": 96.13636016845703, + "learning_rate": 1e-06, + "loss": 0.1313, + "num_tokens": 505584222.0, + "reward": 0.8070886135101318, + "reward_std": 0.16022410988807678, + "rewards/progression_diversity/mean": -0.018680397421121597, + "rewards/progression_diversity/std": 0.0852426066994667, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9318033456802368, + "rewards/symbolic_reward_partial_score/std": 0.22796770930290222, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0674837827682495, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 2.188668727874756, + "step": 1505 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3231370598077774, + "epoch": 3.963157894736842, + "grad_norm": 2.7390880584716797, + "learning_rate": 1e-06, + "loss": 0.0322, + "step": 1506 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31320738792419434, + "epoch": 3.9657894736842105, + "grad_norm": 3.7349812984466553, + "learning_rate": 1e-06, + "loss": 0.1126, + "step": 1507 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.30574825406074524, + "epoch": 3.968421052631579, + "grad_norm": 0.009325175546109676, + "learning_rate": 1e-06, + "loss": 0.1404, + "step": 1508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2576.0, + "completions/mean_length": 1660.708984375, + "completions/mean_terminated_length": 613.4456176757812, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.32026034593582153, + "epoch": 3.9710526315789476, + "frac_reward_zero_std": 0.21875, + "grad_norm": 28.75910186767578, + "learning_rate": 1e-06, + "loss": 0.057, + "num_tokens": 506864681.0, + "reward": 0.7385144233703613, + "reward_std": 0.21706028282642365, + "rewards/progression_diversity/mean": -0.020627107471227646, + "rewards/progression_diversity/std": 0.08610701560974121, + "rewards/symbolic_reward_accuracy/mean": 0.791015625, + "rewards/symbolic_reward_accuracy/std": 0.40698084235191345, + "rewards/symbolic_reward_partial_score/mean": 0.9012044072151184, + "rewards/symbolic_reward_partial_score/std": 0.24362309277057648, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.06502366065979, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 346.0, + "sampling/sampling_logp_difference/mean": 2.6898069381713867, + "step": 1509 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.421875, + "entropy": 0.3142033517360687, + "epoch": 3.973684210526316, + "grad_norm": 171.92105102539062, + "learning_rate": 1e-06, + "loss": 0.0856, + "step": 1510 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1484375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.28372959792613983, + "epoch": 3.9763157894736842, + "grad_norm": 300.263427734375, + "learning_rate": 1e-06, + "loss": 0.1618, + "step": 1511 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.3046766221523285, + "epoch": 3.9789473684210526, + "grad_norm": 0.013155936263501644, + "learning_rate": 1e-06, + "loss": 0.1099, + "step": 1512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2665.0, + "completions/mean_length": 1467.025390625, + "completions/mean_terminated_length": 636.5958862304688, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "entropy": 0.3126714378595352, + "epoch": 3.981578947368421, + "frac_reward_zero_std": 0.34375, + "grad_norm": 230.4984588623047, + "learning_rate": 1e-06, + "loss": 0.1214, + "num_tokens": 508025942.0, + "reward": 0.7970232963562012, + "reward_std": 0.1775650829076767, + "rewards/progression_diversity/mean": -0.019354067742824554, + "rewards/progression_diversity/std": 0.08400935679674149, + "rewards/symbolic_reward_accuracy/mean": 0.873046875, + "rewards/symbolic_reward_accuracy/std": 0.33324605226516724, + "rewards/symbolic_reward_partial_score/mean": 0.9275715947151184, + "rewards/symbolic_reward_partial_score/std": 0.22987112402915955, + "rewards/tag_count_reward/mean": -0.048828125, + "rewards/tag_count_reward/std": 0.2157193273305893, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0662436485290527, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 2.509793281555176, + "step": 1513 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.31753115355968475, + "epoch": 3.984210526315789, + "grad_norm": 0.017170730978250504, + "learning_rate": 1e-06, + "loss": 0.1007, + "step": 1514 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.2985118627548218, + "epoch": 3.986842105263158, + "grad_norm": 0.0552213080227375, + "learning_rate": 1e-06, + "loss": 0.127, + "step": 1515 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31124047935009, + "epoch": 3.9894736842105263, + "grad_norm": 0.018329912796616554, + "learning_rate": 1e-06, + "loss": 0.0722, + "step": 1516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.083984375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2271.0, + "completions/mean_length": 1958.474609375, + "completions/mean_terminated_length": 635.8784790039062, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.285673588514328, + "epoch": 3.9921052631578946, + "frac_reward_zero_std": 0.28125, + "grad_norm": 261.71087646484375, + "learning_rate": 1e-06, + "loss": 0.2431, + "num_tokens": 509438505.0, + "reward": 0.7900243997573853, + "reward_std": 0.1837175190448761, + "rewards/progression_diversity/mean": -0.03076472133398056, + "rewards/progression_diversity/std": 0.10606426000595093, + "rewards/symbolic_reward_accuracy/mean": 0.875, + "rewards/symbolic_reward_accuracy/std": 0.3310423493385315, + "rewards/symbolic_reward_partial_score/mean": 0.9117838144302368, + "rewards/symbolic_reward_partial_score/std": 0.26197776198387146, + "rewards/tag_count_reward/mean": -0.08203125, + "rewards/tag_count_reward/std": 0.2746807038784027, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0607240200042725, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 3.408790111541748, + "step": 1517 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.3019895553588867, + "epoch": 3.9947368421052634, + "grad_norm": 476.9878845214844, + "learning_rate": 1e-06, + "loss": 0.1528, + "step": 1518 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.29304228723049164, + "epoch": 3.9973684210526317, + "grad_norm": 0.3619004487991333, + "learning_rate": 1e-06, + "loss": 0.1619, + "step": 1519 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.3019654005765915, + "epoch": 4.0, + "grad_norm": 0.011948428116738796, + "learning_rate": 1e-06, + "loss": 0.1182, + "step": 1520 + }, + { + "epoch": 4.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.034423828125, + "eval_completions/max_length": 15897.96875, + "eval_completions/max_terminated_length": 1909.625, + "eval_completions/mean_length": 1054.68017578125, + "eval_completions/mean_terminated_length": 508.42947006225586, + "eval_completions/min_length": 225.25, + "eval_completions/min_terminated_length": 225.25, + "eval_entropy": 0.3078602785244584, + "eval_frac_reward_zero_std": 0.30078125, + "eval_loss": 0.03949446976184845, + "eval_num_tokens": 509438505.0, + "eval_reward": 0.8282011039555073, + "eval_reward_std": 0.17971097235567868, + "eval_rewards/progression_diversity/mean": -0.013755144311289769, + "eval_rewards/progression_diversity/std": 0.07045977615052834, + "eval_rewards/symbolic_reward_accuracy/mean": 0.91259765625, + "eval_rewards/symbolic_reward_accuracy/std": 0.2780101113021374, + "eval_rewards/symbolic_reward_partial_score/mean": 0.9448852557688951, + "eval_rewards/symbolic_reward_partial_score/std": 0.1947934958152473, + "eval_rewards/tag_count_reward/mean": -0.02685546875, + "eval_rewards/tag_count_reward/std": 0.15172951598651707, + "eval_runtime": 3921.274, + "eval_samples_per_second": 0.064, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0713170282542706, + "eval_sampling/importance_sampling_ratio/min": 2.523935052067827e-07, + "eval_sampling/sampling_logp_difference/max": 336.4912216961384, + "eval_sampling/sampling_logp_difference/mean": 1.0463141361251473, + "eval_steps_per_second": 0.001, + "step": 1520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2840.0, + "completions/mean_length": 2191.412109375, + "completions/mean_terminated_length": 621.2993774414062, + "completions/min_length": 256.0, + "completions/min_terminated_length": 256.0, + "entropy": 0.27799971401691437, + "epoch": 4.002631578947368, + "frac_reward_zero_std": 0.28125, + "grad_norm": 205.81446838378906, + "learning_rate": 1e-06, + "loss": 0.1941, + "num_tokens": 510976988.0, + "reward": 0.755181074142456, + "reward_std": 0.1851288080215454, + "rewards/progression_diversity/mean": -0.038541071116924286, + "rewards/progression_diversity/std": 0.1202431246638298, + "rewards/symbolic_reward_accuracy/mean": 0.833984375, + "rewards/symbolic_reward_accuracy/std": 0.3724585771560669, + "rewards/symbolic_reward_partial_score/mean": 0.8818359375, + "rewards/symbolic_reward_partial_score/std": 0.2894873321056366, + "rewards/tag_count_reward/mean": -0.09375, + "rewards/tag_count_reward/std": 0.29176566004753113, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.055989146232605, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 4.068850994110107, + "step": 1521 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.30569666624069214, + "epoch": 4.005263157894737, + "grad_norm": 0.4432198405265808, + "learning_rate": 1e-06, + "loss": 0.0885, + "step": 1522 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.28277716040611267, + "epoch": 4.007894736842105, + "grad_norm": 0.008685186505317688, + "learning_rate": 1e-06, + "loss": 0.1703, + "step": 1523 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.40625, + "entropy": 0.2912045568227768, + "epoch": 4.010526315789473, + "grad_norm": 0.018372351303696632, + "learning_rate": 1e-06, + "loss": 0.1348, + "step": 1524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2278.0, + "completions/mean_length": 1152.287109375, + "completions/mean_terminated_length": 565.263671875, + "completions/min_length": 218.0, + "completions/min_terminated_length": 218.0, + "entropy": 0.32067057490348816, + "epoch": 4.0131578947368425, + "frac_reward_zero_std": 0.5, + "grad_norm": 151.9108428955078, + "learning_rate": 1e-06, + "loss": 0.1036, + "num_tokens": 511949071.0, + "reward": 0.8358957767486572, + "reward_std": 0.15225106477737427, + "rewards/progression_diversity/mean": -0.013942432589828968, + "rewards/progression_diversity/std": 0.07440164685249329, + "rewards/symbolic_reward_accuracy/mean": 0.923828125, + "rewards/symbolic_reward_accuracy/std": 0.26553234457969666, + "rewards/symbolic_reward_partial_score/mean": 0.9501953125, + "rewards/symbolic_reward_partial_score/std": 0.1953859031200409, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0619966983795166, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 2.296659469604492, + "step": 1525 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3325800895690918, + "epoch": 4.015789473684211, + "grad_norm": 0.02713238075375557, + "learning_rate": 1e-06, + "loss": 0.0424, + "step": 1526 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3067079782485962, + "epoch": 4.018421052631579, + "grad_norm": 0.009789610281586647, + "learning_rate": 1e-06, + "loss": 0.1041, + "step": 1527 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3153266906738281, + "epoch": 4.021052631578947, + "grad_norm": 0.003507691202685237, + "learning_rate": 1e-06, + "loss": 0.0967, + "step": 1528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2340.0, + "completions/mean_length": 1278.15234375, + "completions/mean_terminated_length": 599.9306030273438, + "completions/min_length": 239.0, + "completions/min_terminated_length": 239.0, + "entropy": 0.3092862367630005, + "epoch": 4.023684210526316, + "frac_reward_zero_std": 0.375, + "grad_norm": 305.3227233886719, + "learning_rate": 1e-06, + "loss": 0.0672, + "num_tokens": 513002301.0, + "reward": 0.7656987905502319, + "reward_std": 0.17219382524490356, + "rewards/progression_diversity/mean": -0.017035705968737602, + "rewards/progression_diversity/std": 0.08615429699420929, + "rewards/symbolic_reward_accuracy/mean": 0.81640625, + "rewards/symbolic_reward_accuracy/std": 0.3875311613082886, + "rewards/symbolic_reward_partial_score/mean": 0.93115234375, + "rewards/symbolic_reward_partial_score/std": 0.20142759382724762, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0569286346435547, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 2.1248581409454346, + "step": 1529 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3113991618156433, + "epoch": 4.026315789473684, + "grad_norm": 58.646968841552734, + "learning_rate": 1e-06, + "loss": 0.1007, + "step": 1530 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.3031221777200699, + "epoch": 4.028947368421052, + "grad_norm": 0.011349079199135303, + "learning_rate": 1e-06, + "loss": 0.1034, + "step": 1531 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.32167190313339233, + "epoch": 4.031578947368421, + "grad_norm": 0.010632401332259178, + "learning_rate": 1e-06, + "loss": 0.0584, + "step": 1532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2379.0, + "completions/mean_length": 1502.974609375, + "completions/mean_terminated_length": 576.7697143554688, + "completions/min_length": 204.0, + "completions/min_terminated_length": 204.0, + "entropy": 0.3021652102470398, + "epoch": 4.03421052631579, + "frac_reward_zero_std": 0.46875, + "grad_norm": 165.5340576171875, + "learning_rate": 1e-06, + "loss": 0.1347, + "num_tokens": 514159952.0, + "reward": 0.8022516965866089, + "reward_std": 0.11929187178611755, + "rewards/progression_diversity/mean": -0.023855865001678467, + "rewards/progression_diversity/std": 0.0998440608382225, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9249674081802368, + "rewards/symbolic_reward_partial_score/std": 0.23110011219978333, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0544638633728027, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 2.4234862327575684, + "step": 1533 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3111570328474045, + "epoch": 4.036842105263158, + "grad_norm": 0.01784106157720089, + "learning_rate": 1e-06, + "loss": 0.1066, + "step": 1534 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3170369863510132, + "epoch": 4.0394736842105265, + "grad_norm": 3.104100227355957, + "learning_rate": 1e-06, + "loss": 0.0767, + "step": 1535 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3106560707092285, + "epoch": 4.042105263157895, + "grad_norm": 0.010589975863695145, + "learning_rate": 1e-06, + "loss": 0.1156, + "step": 1536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2955.0, + "completions/mean_length": 909.375, + "completions/mean_terminated_length": 537.9840087890625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.33030684292316437, + "epoch": 4.044736842105263, + "frac_reward_zero_std": 0.4375, + "grad_norm": 51.92629623413086, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 515009328.0, + "reward": 0.8129271268844604, + "reward_std": 0.14900250732898712, + "rewards/progression_diversity/mean": -0.011006257496774197, + "rewards/progression_diversity/std": 0.07234536856412888, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9510090947151184, + "rewards/symbolic_reward_partial_score/std": 0.1690969616174698, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0632414817810059, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 1.5446823835372925, + "step": 1537 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.32397404313087463, + "epoch": 4.0473684210526315, + "grad_norm": 0.022417036816477776, + "learning_rate": 1e-06, + "loss": 0.0486, + "step": 1538 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3341270387172699, + "epoch": 4.05, + "grad_norm": 0.015315905213356018, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1539 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31841354072093964, + "epoch": 4.052631578947368, + "grad_norm": 61.291114807128906, + "learning_rate": 1e-06, + "loss": 0.0999, + "step": 1540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2456.0, + "completions/mean_length": 1415.52734375, + "completions/mean_terminated_length": 582.23095703125, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.31011515855789185, + "epoch": 4.0552631578947365, + "frac_reward_zero_std": 0.40625, + "grad_norm": 137.6245574951172, + "learning_rate": 1e-06, + "loss": 0.0506, + "num_tokens": 516117534.0, + "reward": 0.7666909694671631, + "reward_std": 0.1560915857553482, + "rewards/progression_diversity/mean": -0.02036316879093647, + "rewards/progression_diversity/std": 0.09054167568683624, + "rewards/symbolic_reward_accuracy/mean": 0.83203125, + "rewards/symbolic_reward_accuracy/std": 0.374204158782959, + "rewards/symbolic_reward_partial_score/mean": 0.9065755009651184, + "rewards/symbolic_reward_partial_score/std": 0.24309590458869934, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.058464765548706, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.0097904205322266, + "step": 1541 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.32095426321029663, + "epoch": 4.057894736842106, + "grad_norm": 0.008144257590174675, + "learning_rate": 1e-06, + "loss": 0.0401, + "step": 1542 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.30426955223083496, + "epoch": 4.060526315789474, + "grad_norm": 0.016386935487389565, + "learning_rate": 1e-06, + "loss": 0.119, + "step": 1543 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3272694945335388, + "epoch": 4.063157894736842, + "grad_norm": 39.390586853027344, + "learning_rate": 1e-06, + "loss": 0.0768, + "step": 1544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3431.0, + "completions/mean_length": 1677.0234375, + "completions/mean_terminated_length": 597.8951416015625, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.3231520652770996, + "epoch": 4.065789473684211, + "frac_reward_zero_std": 0.375, + "grad_norm": 80.80924987792969, + "learning_rate": 1e-06, + "loss": 0.0269, + "num_tokens": 517367626.0, + "reward": 0.7787384390830994, + "reward_std": 0.16898325085639954, + "rewards/progression_diversity/mean": -0.02655177377164364, + "rewards/progression_diversity/std": 0.1030859723687172, + "rewards/symbolic_reward_accuracy/mean": 0.849609375, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.9169921875, + "rewards/symbolic_reward_partial_score/std": 0.24586138129234314, + "rewards/tag_count_reward/mean": -0.05859375, + "rewards/tag_count_reward/std": 0.23509246110916138, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0506144762039185, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 4.009866714477539, + "step": 1545 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.3029029667377472, + "epoch": 4.068421052631579, + "grad_norm": 0.010388639755547047, + "learning_rate": 1e-06, + "loss": 0.1362, + "step": 1546 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3171136975288391, + "epoch": 4.071052631578947, + "grad_norm": 0.00802148599177599, + "learning_rate": 1e-06, + "loss": 0.06, + "step": 1547 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.15625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.28475718200206757, + "epoch": 4.073684210526316, + "grad_norm": 0.005422931630164385, + "learning_rate": 1e-06, + "loss": 0.291, + "step": 1548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2800.0, + "completions/mean_length": 1218.509765625, + "completions/mean_terminated_length": 665.9210815429688, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.32124534249305725, + "epoch": 4.076315789473684, + "frac_reward_zero_std": 0.3125, + "grad_norm": 359.06634521484375, + "learning_rate": 1e-06, + "loss": 0.0374, + "num_tokens": 518415663.0, + "reward": 0.8058355450630188, + "reward_std": 0.17662674188613892, + "rewards/progression_diversity/mean": -0.012153420597314835, + "rewards/progression_diversity/std": 0.06827174872159958, + "rewards/symbolic_reward_accuracy/mean": 0.87890625, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.9397786259651184, + "rewards/symbolic_reward_partial_score/std": 0.19970273971557617, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0600073337554932, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.2380099296569824, + "step": 1549 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3046875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.375, + "entropy": 0.29867467284202576, + "epoch": 4.078947368421052, + "grad_norm": 0.015004157088696957, + "learning_rate": 1e-06, + "loss": 0.1029, + "step": 1550 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.3169206827878952, + "epoch": 4.081578947368421, + "grad_norm": 0.020958561450242996, + "learning_rate": 1e-06, + "loss": 0.0588, + "step": 1551 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.3124808520078659, + "epoch": 4.08421052631579, + "grad_norm": 0.011239241808652878, + "learning_rate": 1e-06, + "loss": 0.0997, + "step": 1552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3201.0, + "completions/mean_length": 1185.564453125, + "completions/mean_terminated_length": 599.823486328125, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.315788209438324, + "epoch": 4.086842105263158, + "frac_reward_zero_std": 0.4375, + "grad_norm": 902.8833618164062, + "learning_rate": 1e-06, + "loss": 0.1105, + "num_tokens": 519423472.0, + "reward": 0.816677451133728, + "reward_std": 0.13706813752651215, + "rewards/progression_diversity/mean": -0.011946848593652248, + "rewards/progression_diversity/std": 0.0664309710264206, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.9440103769302368, + "rewards/symbolic_reward_partial_score/std": 0.19122956693172455, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0609365701675415, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.0578854084014893, + "step": 1553 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3234410434961319, + "epoch": 4.089473684210526, + "grad_norm": 0.014460497535765171, + "learning_rate": 1e-06, + "loss": 0.0497, + "step": 1554 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.31574487686157227, + "epoch": 4.092105263157895, + "grad_norm": 0.00562589755281806, + "learning_rate": 1e-06, + "loss": 0.0687, + "step": 1555 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.30954110622406006, + "epoch": 4.094736842105263, + "grad_norm": 0.011875905096530914, + "learning_rate": 1e-06, + "loss": 0.0327, + "step": 1556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3258.0, + "completions/mean_length": 1236.6171875, + "completions/mean_terminated_length": 588.7658081054688, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "entropy": 0.317048043012619, + "epoch": 4.097368421052631, + "frac_reward_zero_std": 0.46875, + "grad_norm": 72.20258331298828, + "learning_rate": 1e-06, + "loss": 0.0398, + "num_tokens": 520449900.0, + "reward": 0.8271297216415405, + "reward_std": 0.1219562292098999, + "rewards/progression_diversity/mean": -0.016525140032172203, + "rewards/progression_diversity/std": 0.0804808959364891, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9503580331802368, + "rewards/symbolic_reward_partial_score/std": 0.1865689754486084, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0606681108474731, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.4164552688598633, + "step": 1557 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.3193093240261078, + "epoch": 4.1, + "grad_norm": 0.008888328447937965, + "learning_rate": 1e-06, + "loss": 0.0658, + "step": 1558 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3196675330400467, + "epoch": 4.102631578947369, + "grad_norm": 0.007627990562468767, + "learning_rate": 1e-06, + "loss": 0.0829, + "step": 1559 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.31077609956264496, + "epoch": 4.105263157894737, + "grad_norm": 0.011663636192679405, + "learning_rate": 1e-06, + "loss": 0.1007, + "step": 1560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8140.0, + "completions/mean_length": 1069.8125, + "completions/mean_terminated_length": 607.6136474609375, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.34023210406303406, + "epoch": 4.1078947368421055, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.020510073751211166, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 521379756.0, + "reward": 0.8407922387123108, + "reward_std": 0.14206728339195251, + "rewards/progression_diversity/mean": -0.012575984001159668, + "rewards/progression_diversity/std": 0.07108542323112488, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9606119394302368, + "rewards/symbolic_reward_partial_score/std": 0.1664878875017166, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0578272342681885, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 350.0, + "sampling/sampling_logp_difference/mean": 2.6809022426605225, + "step": 1561 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.31028036773204803, + "epoch": 4.110526315789474, + "grad_norm": 0.0143095962703228, + "learning_rate": 1e-06, + "loss": 0.1249, + "step": 1562 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.30853691697120667, + "epoch": 4.113157894736842, + "grad_norm": 0.05083174258470535, + "learning_rate": 1e-06, + "loss": 0.1291, + "step": 1563 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.32772916555404663, + "epoch": 4.11578947368421, + "grad_norm": 0.020028606057167053, + "learning_rate": 1e-06, + "loss": 0.0466, + "step": 1564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15382.0, + "completions/mean_length": 1305.990234375, + "completions/mean_terminated_length": 693.06298828125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.30608853697776794, + "epoch": 4.118421052631579, + "frac_reward_zero_std": 0.4375, + "grad_norm": 67.87190246582031, + "learning_rate": 1e-06, + "loss": 0.0841, + "num_tokens": 522466247.0, + "reward": 0.8353767395019531, + "reward_std": 0.13427817821502686, + "rewards/progression_diversity/mean": -0.017013559117913246, + "rewards/progression_diversity/std": 0.08219697326421738, + "rewards/symbolic_reward_accuracy/mean": 0.921875, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.9505208134651184, + "rewards/symbolic_reward_partial_score/std": 0.19484204053878784, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0560702085494995, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.4243669509887695, + "step": 1565 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.29687653481960297, + "epoch": 4.121052631578947, + "grad_norm": 0.006236726883798838, + "learning_rate": 1e-06, + "loss": 0.1279, + "step": 1566 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3095232844352722, + "epoch": 4.123684210526315, + "grad_norm": 0.002965538529679179, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1567 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.313509002327919, + "epoch": 4.126315789473685, + "grad_norm": 0.018090499565005302, + "learning_rate": 1e-06, + "loss": 0.0931, + "step": 1568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3464.0, + "completions/mean_length": 1349.125, + "completions/mean_terminated_length": 577.314208984375, + "completions/min_length": 246.0, + "completions/min_terminated_length": 246.0, + "entropy": 0.2996673285961151, + "epoch": 4.128947368421053, + "frac_reward_zero_std": 0.46875, + "grad_norm": 218.44454956054688, + "learning_rate": 1e-06, + "loss": 0.0891, + "num_tokens": 523526119.0, + "reward": 0.8193434476852417, + "reward_std": 0.13413912057876587, + "rewards/progression_diversity/mean": -0.018784930929541588, + "rewards/progression_diversity/std": 0.0878358706831932, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.9388021230697632, + "rewards/symbolic_reward_partial_score/std": 0.21514074504375458, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.054738998413086, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.5745482444763184, + "step": 1569 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.31027545034885406, + "epoch": 4.131578947368421, + "grad_norm": 0.004378543235361576, + "learning_rate": 1e-06, + "loss": 0.0894, + "step": 1570 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31161196529865265, + "epoch": 4.13421052631579, + "grad_norm": 0.005518985912203789, + "learning_rate": 1e-06, + "loss": 0.1057, + "step": 1571 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.322713166475296, + "epoch": 4.136842105263158, + "grad_norm": 0.005724793300032616, + "learning_rate": 1e-06, + "loss": 0.023, + "step": 1572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3315.0, + "completions/mean_length": 1232.330078125, + "completions/mean_terminated_length": 616.4085083007812, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.31801025569438934, + "epoch": 4.139473684210526, + "frac_reward_zero_std": 0.5, + "grad_norm": 114.2596664428711, + "learning_rate": 1e-06, + "loss": 0.0698, + "num_tokens": 524540528.0, + "reward": 0.8093826770782471, + "reward_std": 0.1396641731262207, + "rewards/progression_diversity/mean": -0.013879730366170406, + "rewards/progression_diversity/std": 0.07435765862464905, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9444987177848816, + "rewards/symbolic_reward_partial_score/std": 0.1908387839794159, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.054489254951477, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.705132484436035, + "step": 1573 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.31481173634529114, + "epoch": 4.1421052631578945, + "grad_norm": 0.011268100701272488, + "learning_rate": 1e-06, + "loss": 0.0299, + "step": 1574 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30692553520202637, + "epoch": 4.144736842105263, + "grad_norm": 0.013008923269808292, + "learning_rate": 1e-06, + "loss": 0.1172, + "step": 1575 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.30435749888420105, + "epoch": 4.147368421052631, + "grad_norm": 0.016314802691340446, + "learning_rate": 1e-06, + "loss": 0.1019, + "step": 1576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2267.0, + "completions/mean_length": 1044.396484375, + "completions/mean_terminated_length": 644.7675170898438, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.329788476228714, + "epoch": 4.15, + "frac_reward_zero_std": 0.46875, + "grad_norm": 0.022590087726712227, + "learning_rate": 1e-06, + "loss": 0.0132, + "num_tokens": 525490075.0, + "reward": 0.8446779847145081, + "reward_std": 0.13736554980278015, + "rewards/progression_diversity/mean": -0.009744609706103802, + "rewards/progression_diversity/std": 0.06334702670574188, + "rewards/symbolic_reward_accuracy/mean": 0.931640625, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + "rewards/symbolic_reward_partial_score/mean": 0.9611002206802368, + "rewards/symbolic_reward_partial_score/std": 0.16516920924186707, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0595576763153076, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 1.8629345893859863, + "step": 1577 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.310794472694397, + "epoch": 4.152631578947369, + "grad_norm": 0.0059421774931252, + "learning_rate": 1e-06, + "loss": 0.0536, + "step": 1578 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.30790117383003235, + "epoch": 4.155263157894737, + "grad_norm": 1.5674898624420166, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 1579 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.31684084236621857, + "epoch": 4.157894736842105, + "grad_norm": 0.006289552431553602, + "learning_rate": 1e-06, + "loss": 0.049, + "step": 1580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2869.0, + "completions/mean_length": 1460.595703125, + "completions/mean_terminated_length": 726.6577758789062, + "completions/min_length": 254.0, + "completions/min_terminated_length": 254.0, + "entropy": 0.2980321794748306, + "epoch": 4.160526315789474, + "frac_reward_zero_std": 0.375, + "grad_norm": 215.46676635742188, + "learning_rate": 1e-06, + "loss": 0.0781, + "num_tokens": 526672588.0, + "reward": 0.8091208934783936, + "reward_std": 0.16659247875213623, + "rewards/progression_diversity/mean": -0.015641260892152786, + "rewards/progression_diversity/std": 0.07609428465366364, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9378255009651184, + "rewards/symbolic_reward_partial_score/std": 0.20248618721961975, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.047804594039917, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 3.248504400253296, + "step": 1581 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.28611473739147186, + "epoch": 4.163157894736842, + "grad_norm": 5.547037124633789, + "learning_rate": 1e-06, + "loss": 0.0783, + "step": 1582 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.3098125159740448, + "epoch": 4.16578947368421, + "grad_norm": 0.0198947936296463, + "learning_rate": 1e-06, + "loss": 0.0761, + "step": 1583 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3102129250764847, + "epoch": 4.168421052631579, + "grad_norm": 0.020427729934453964, + "learning_rate": 1e-06, + "loss": 0.0666, + "step": 1584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2539.0, + "completions/mean_length": 1271.625, + "completions/mean_terminated_length": 689.2008056640625, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.3241703659296036, + "epoch": 4.171052631578948, + "frac_reward_zero_std": 0.5, + "grad_norm": 19.84275245666504, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 527726124.0, + "reward": 0.8260881900787354, + "reward_std": 0.1229025349020958, + "rewards/progression_diversity/mean": -0.013252872042357922, + "rewards/progression_diversity/std": 0.07066137343645096, + "rewards/symbolic_reward_accuracy/mean": 0.908203125, + "rewards/symbolic_reward_accuracy/std": 0.289021372795105, + "rewards/symbolic_reward_partial_score/mean": 0.94482421875, + "rewards/symbolic_reward_partial_score/std": 0.19333821535110474, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0540461540222168, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 2.846632480621338, + "step": 1585 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.30269454419612885, + "epoch": 4.173684210526316, + "grad_norm": 1.6151353120803833, + "learning_rate": 1e-06, + "loss": 0.0914, + "step": 1586 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.32799434661865234, + "epoch": 4.176315789473684, + "grad_norm": 0.03124081902205944, + "learning_rate": 1e-06, + "loss": 0.007, + "step": 1587 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.3005277067422867, + "epoch": 4.178947368421053, + "grad_norm": 0.007625295780599117, + "learning_rate": 1e-06, + "loss": 0.1777, + "step": 1588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.06640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2949.0, + "completions/mean_length": 1715.181640625, + "completions/mean_terminated_length": 671.7928466796875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.30243825912475586, + "epoch": 4.181578947368421, + "frac_reward_zero_std": 0.28125, + "grad_norm": 418.0889587402344, + "learning_rate": 1e-06, + "loss": 0.1181, + "num_tokens": 528990761.0, + "reward": 0.7599914073944092, + "reward_std": 0.15093812346458435, + "rewards/progression_diversity/mean": -0.021372389048337936, + "rewards/progression_diversity/std": 0.0902642160654068, + "rewards/symbolic_reward_accuracy/mean": 0.826171875, + "rewards/symbolic_reward_accuracy/std": 0.3793322443962097, + "rewards/symbolic_reward_partial_score/mean": 0.8992512822151184, + "rewards/symbolic_reward_partial_score/std": 0.2561655044555664, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0439589023590088, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 348.0, + "sampling/sampling_logp_difference/mean": 3.9997878074645996, + "step": 1589 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.4296875, + "entropy": 0.29131756722927094, + "epoch": 4.184210526315789, + "grad_norm": 0.009967650286853313, + "learning_rate": 1e-06, + "loss": 0.1418, + "step": 1590 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.288121297955513, + "epoch": 4.186842105263158, + "grad_norm": 0.011154073290526867, + "learning_rate": 1e-06, + "loss": 0.1733, + "step": 1591 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.32074061036109924, + "epoch": 4.189473684210526, + "grad_norm": 0.015442883595824242, + "learning_rate": 1e-06, + "loss": 0.057, + "step": 1592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3200.0, + "completions/mean_length": 1718.6875, + "completions/mean_terminated_length": 741.0000610351562, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.3073541969060898, + "epoch": 4.192105263157894, + "frac_reward_zero_std": 0.5625, + "grad_norm": 62.47317123413086, + "learning_rate": 1e-06, + "loss": 0.0488, + "num_tokens": 530279049.0, + "reward": 0.776131272315979, + "reward_std": 0.11894486844539642, + "rewards/progression_diversity/mean": -0.01871039718389511, + "rewards/progression_diversity/std": 0.07976720482110977, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.9021810293197632, + "rewards/symbolic_reward_partial_score/std": 0.25698360800743103, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0540015697479248, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 346.0, + "sampling/sampling_logp_difference/mean": 2.4201743602752686, + "step": 1593 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.29900647699832916, + "epoch": 4.1947368421052635, + "grad_norm": 103.9713363647461, + "learning_rate": 1e-06, + "loss": 0.0922, + "step": 1594 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.31466494500637054, + "epoch": 4.197368421052632, + "grad_norm": 0.0200185626745224, + "learning_rate": 1e-06, + "loss": 0.0658, + "step": 1595 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.078125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.30266304314136505, + "epoch": 4.2, + "grad_norm": 0.011375721544027328, + "learning_rate": 1e-06, + "loss": 0.0875, + "step": 1596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3062.0, + "completions/mean_length": 1656.662109375, + "completions/mean_terminated_length": 740.0228881835938, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.3035944253206253, + "epoch": 4.2026315789473685, + "frac_reward_zero_std": 0.3125, + "grad_norm": 208.2892608642578, + "learning_rate": 1e-06, + "loss": 0.0753, + "num_tokens": 531551420.0, + "reward": 0.7731266021728516, + "reward_std": 0.16756606101989746, + "rewards/progression_diversity/mean": -0.021324899047613144, + "rewards/progression_diversity/std": 0.08820579946041107, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.9150390625, + "rewards/symbolic_reward_partial_score/std": 0.22819702327251434, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0414807796478271, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 350.0, + "sampling/sampling_logp_difference/mean": 3.9151625633239746, + "step": 1597 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.3109179735183716, + "epoch": 4.205263157894737, + "grad_norm": 0.0169313196092844, + "learning_rate": 1e-06, + "loss": 0.1055, + "step": 1598 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.31678929924964905, + "epoch": 4.207894736842105, + "grad_norm": 0.07289431989192963, + "learning_rate": 1e-06, + "loss": 0.0648, + "step": 1599 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2846733182668686, + "epoch": 4.2105263157894735, + "grad_norm": 0.020705915987491608, + "learning_rate": 1e-06, + "loss": 0.1821, + "step": 1600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.10546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4576.0, + "completions/mean_length": 2429.921875, + "completions/mean_terminated_length": 784.6812133789062, + "completions/min_length": 261.0, + "completions/min_terminated_length": 261.0, + "entropy": 0.2895173579454422, + "epoch": 4.213157894736842, + "frac_reward_zero_std": 0.40625, + "grad_norm": 806.2984619140625, + "learning_rate": 1e-06, + "loss": 0.1064, + "num_tokens": 533202356.0, + "reward": 0.7311378717422485, + "reward_std": 0.14681148529052734, + "rewards/progression_diversity/mean": -0.030749481171369553, + "rewards/progression_diversity/std": 0.10168987512588501, + "rewards/symbolic_reward_accuracy/mean": 0.794921875, + "rewards/symbolic_reward_accuracy/std": 0.4041535556316376, + "rewards/symbolic_reward_partial_score/mean": 0.876953125, + "rewards/symbolic_reward_partial_score/std": 0.2930140495300293, + "rewards/tag_count_reward/mean": -0.0859375, + "rewards/tag_count_reward/std": 0.28054583072662354, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0381789207458496, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 352.0, + "sampling/sampling_logp_difference/mean": 4.134341239929199, + "step": 1601 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.28859490156173706, + "epoch": 4.215789473684211, + "grad_norm": 0.0084293307736516, + "learning_rate": 1e-06, + "loss": 0.0704, + "step": 1602 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2843547612428665, + "epoch": 4.218421052631579, + "grad_norm": 0.0123353386297822, + "learning_rate": 1e-06, + "loss": 0.1586, + "step": 1603 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.140625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.27717554569244385, + "epoch": 4.221052631578948, + "grad_norm": 0.020575549453496933, + "learning_rate": 1e-06, + "loss": 0.1076, + "step": 1604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3167.0, + "completions/mean_length": 1855.916015625, + "completions/mean_terminated_length": 757.1533813476562, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "entropy": 0.2824231684207916, + "epoch": 4.223684210526316, + "frac_reward_zero_std": 0.34375, + "grad_norm": 216.76248168945312, + "learning_rate": 1e-06, + "loss": 0.1544, + "num_tokens": 534562409.0, + "reward": 0.7700595259666443, + "reward_std": 0.1850237250328064, + "rewards/progression_diversity/mean": -0.025300273671746254, + "rewards/progression_diversity/std": 0.09943580627441406, + "rewards/symbolic_reward_accuracy/mean": 0.83984375, + "rewards/symbolic_reward_accuracy/std": 0.3671095669269562, + "rewards/symbolic_reward_partial_score/mean": 0.908203125, + "rewards/symbolic_reward_partial_score/std": 0.24698437750339508, + "rewards/tag_count_reward/mean": -0.060546875, + "rewards/tag_count_reward/std": 0.2387305200099945, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0366488695144653, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 356.0, + "sampling/sampling_logp_difference/mean": 4.752798080444336, + "step": 1605 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.30687089264392853, + "epoch": 4.226315789473684, + "grad_norm": 0.03631648048758507, + "learning_rate": 1e-06, + "loss": 0.0767, + "step": 1606 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2971319705247879, + "epoch": 4.228947368421053, + "grad_norm": 0.016691574826836586, + "learning_rate": 1e-06, + "loss": 0.1243, + "step": 1607 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.29933421313762665, + "epoch": 4.231578947368421, + "grad_norm": 0.024404583498835564, + "learning_rate": 1e-06, + "loss": 0.0898, + "step": 1608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2285.0, + "completions/mean_length": 1166.900390625, + "completions/mean_terminated_length": 644.29296875, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.3166111558675766, + "epoch": 4.234210526315789, + "frac_reward_zero_std": 0.53125, + "grad_norm": 546.3336791992188, + "learning_rate": 1e-06, + "loss": 0.0718, + "num_tokens": 535559318.0, + "reward": 0.8583120107650757, + "reward_std": 0.08758790791034698, + "rewards/progression_diversity/mean": -0.013530386611819267, + "rewards/progression_diversity/std": 0.0759081244468689, + "rewards/symbolic_reward_accuracy/mean": 0.94921875, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.9734700918197632, + "rewards/symbolic_reward_partial_score/std": 0.13808730244636536, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.050557017326355, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 356.0, + "sampling/sampling_logp_difference/mean": 3.118114471435547, + "step": 1609 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.31313398480415344, + "epoch": 4.2368421052631575, + "grad_norm": 0.01867660880088806, + "learning_rate": 1e-06, + "loss": 0.1433, + "step": 1610 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.31812839210033417, + "epoch": 4.239473684210527, + "grad_norm": 0.00977341365069151, + "learning_rate": 1e-06, + "loss": 0.0707, + "step": 1611 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.32100366055965424, + "epoch": 4.242105263157895, + "grad_norm": 0.006353206932544708, + "learning_rate": 1e-06, + "loss": 0.0522, + "step": 1612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.046875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2459.0, + "completions/mean_length": 1447.01953125, + "completions/mean_terminated_length": 712.4138793945312, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.286122590303421, + "epoch": 4.244736842105263, + "frac_reward_zero_std": 0.3125, + "grad_norm": 307.3343200683594, + "learning_rate": 1e-06, + "loss": 0.1766, + "num_tokens": 536710016.0, + "reward": 0.7893308401107788, + "reward_std": 0.1426737904548645, + "rewards/progression_diversity/mean": -0.017116881906986237, + "rewards/progression_diversity/std": 0.0786111056804657, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.9259439706802368, + "rewards/symbolic_reward_partial_score/std": 0.20985758304595947, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0420353412628174, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 358.0, + "sampling/sampling_logp_difference/mean": 4.076284408569336, + "step": 1613 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.29413002729415894, + "epoch": 4.247368421052632, + "grad_norm": 0.011863662861287594, + "learning_rate": 1e-06, + "loss": 0.1084, + "step": 1614 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3984375, + "entropy": 0.32583458721637726, + "epoch": 4.25, + "grad_norm": 0.009227105416357517, + "learning_rate": 1e-06, + "loss": 0.0131, + "step": 1615 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.30109357833862305, + "epoch": 4.252631578947368, + "grad_norm": 0.08455037325620651, + "learning_rate": 1e-06, + "loss": 0.0389, + "step": 1616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3129.0, + "completions/mean_length": 1303.62109375, + "completions/mean_terminated_length": 690.5975341796875, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.3038036525249481, + "epoch": 4.255263157894737, + "frac_reward_zero_std": 0.46875, + "grad_norm": 244.20498657226562, + "learning_rate": 1e-06, + "loss": 0.1133, + "num_tokens": 537764254.0, + "reward": 0.804751992225647, + "reward_std": 0.1407267451286316, + "rewards/progression_diversity/mean": -0.01308908686041832, + "rewards/progression_diversity/std": 0.06904362142086029, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.9322916269302368, + "rewards/symbolic_reward_partial_score/std": 0.21590134501457214, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049241065979004, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 360.0, + "sampling/sampling_logp_difference/mean": 3.09298038482666, + "step": 1617 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.31994016468524933, + "epoch": 4.257894736842105, + "grad_norm": 0.010734348557889462, + "learning_rate": 1e-06, + "loss": 0.0265, + "step": 1618 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.30486488342285156, + "epoch": 4.260526315789473, + "grad_norm": 0.010809490457177162, + "learning_rate": 1e-06, + "loss": 0.0587, + "step": 1619 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2901280075311661, + "epoch": 4.2631578947368425, + "grad_norm": 0.03101206384599209, + "learning_rate": 1e-06, + "loss": 0.086, + "step": 1620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3629.0, + "completions/mean_length": 1185.048828125, + "completions/mean_terminated_length": 663.064697265625, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.3062327206134796, + "epoch": 4.265789473684211, + "frac_reward_zero_std": 0.34375, + "grad_norm": 202.47483825683594, + "learning_rate": 1e-06, + "loss": 0.0557, + "num_tokens": 538737751.0, + "reward": 0.8110127449035645, + "reward_std": 0.16020728647708893, + "rewards/progression_diversity/mean": -0.012012619525194168, + "rewards/progression_diversity/std": 0.06591516733169556, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9381510019302368, + "rewards/symbolic_reward_partial_score/std": 0.19927191734313965, + "rewards/tag_count_reward/mean": -0.0234375, + "rewards/tag_count_reward/std": 0.15143637359142303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.049810528755188, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 360.0, + "sampling/sampling_logp_difference/mean": 3.169362783432007, + "step": 1621 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.2987634390592575, + "epoch": 4.268421052631579, + "grad_norm": 141.50709533691406, + "learning_rate": 1e-06, + "loss": 0.1229, + "step": 1622 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.3115040361881256, + "epoch": 4.271052631578947, + "grad_norm": 0.011638534255325794, + "learning_rate": 1e-06, + "loss": 0.0587, + "step": 1623 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.34375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3828125, + "entropy": 0.31850266456604004, + "epoch": 4.273684210526316, + "grad_norm": 0.00664818100631237, + "learning_rate": 1e-06, + "loss": 0.0181, + "step": 1624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2810.0, + "completions/mean_length": 1301.916015625, + "completions/mean_terminated_length": 752.3663940429688, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.3117539584636688, + "epoch": 4.276315789473684, + "frac_reward_zero_std": 0.34375, + "grad_norm": 244.9304962158203, + "learning_rate": 1e-06, + "loss": 0.0613, + "num_tokens": 539834028.0, + "reward": 0.8020786046981812, + "reward_std": 0.16100260615348816, + "rewards/progression_diversity/mean": -0.011866888031363487, + "rewards/progression_diversity/std": 0.06344626098871231, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.93896484375, + "rewards/symbolic_reward_partial_score/std": 0.1891382783651352, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0457390546798706, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 364.0, + "sampling/sampling_logp_difference/mean": 3.5810694694519043, + "step": 1625 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2959267497062683, + "epoch": 4.278947368421052, + "grad_norm": 0.020144561305642128, + "learning_rate": 1e-06, + "loss": 0.1104, + "step": 1626 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3018384575843811, + "epoch": 4.281578947368421, + "grad_norm": 0.014604386873543262, + "learning_rate": 1e-06, + "loss": 0.0735, + "step": 1627 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.30800075829029083, + "epoch": 4.284210526315789, + "grad_norm": 0.014517586678266525, + "learning_rate": 1e-06, + "loss": 0.0251, + "step": 1628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3842.0, + "completions/mean_length": 956.640625, + "completions/mean_terminated_length": 711.761962890625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "entropy": 0.3077455312013626, + "epoch": 4.286842105263158, + "frac_reward_zero_std": 0.5625, + "grad_norm": 398.7921447753906, + "learning_rate": 1e-06, + "loss": 0.0327, + "num_tokens": 540715284.0, + "reward": 0.847118616104126, + "reward_std": 0.09315143525600433, + "rewards/progression_diversity/mean": -0.004935206845402718, + "rewards/progression_diversity/std": 0.043800655752420425, + "rewards/symbolic_reward_accuracy/mean": 0.9296875, + "rewards/symbolic_reward_accuracy/std": 0.25592297315597534, + "rewards/symbolic_reward_partial_score/mean": 0.9690755605697632, + "rewards/symbolic_reward_partial_score/std": 0.13879333436489105, + "rewards/tag_count_reward/mean": -0.013671875, + "rewards/tag_count_reward/std": 0.1162383034825325, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.059929609298706, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 364.0, + "sampling/sampling_logp_difference/mean": 1.7687201499938965, + "step": 1629 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3137192130088806, + "epoch": 4.2894736842105265, + "grad_norm": 0.01701788231730461, + "learning_rate": 1e-06, + "loss": 0.0515, + "step": 1630 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.32366204261779785, + "epoch": 4.292105263157895, + "grad_norm": 0.03292662650346756, + "learning_rate": 1e-06, + "loss": 0.0018, + "step": 1631 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3094526529312134, + "epoch": 4.294736842105263, + "grad_norm": 0.02371911332011223, + "learning_rate": 1e-06, + "loss": 0.064, + "step": 1632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4493.0, + "completions/mean_length": 1390.392578125, + "completions/mean_terminated_length": 717.2101440429688, + "completions/min_length": 230.0, + "completions/min_terminated_length": 230.0, + "entropy": 0.30759990215301514, + "epoch": 4.2973684210526315, + "frac_reward_zero_std": 0.59375, + "grad_norm": 396.1076965332031, + "learning_rate": 1e-06, + "loss": 0.082, + "num_tokens": 541829469.0, + "reward": 0.7947248220443726, + "reward_std": 0.0687393993139267, + "rewards/progression_diversity/mean": -0.009939271956682205, + "rewards/progression_diversity/std": 0.055136967450380325, + "rewards/symbolic_reward_accuracy/mean": 0.8671875, + "rewards/symbolic_reward_accuracy/std": 0.33970388770103455, + "rewards/symbolic_reward_partial_score/mean": 0.9267578125, + "rewards/symbolic_reward_partial_score/std": 0.2075078934431076, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.054058313369751, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 364.0, + "sampling/sampling_logp_difference/mean": 2.6795334815979004, + "step": 1633 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3071286678314209, + "epoch": 4.3, + "grad_norm": 0.010865924879908562, + "learning_rate": 1e-06, + "loss": 0.0643, + "step": 1634 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.31138360500335693, + "epoch": 4.302631578947368, + "grad_norm": 0.012006482109427452, + "learning_rate": 1e-06, + "loss": 0.0281, + "step": 1635 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3028770089149475, + "epoch": 4.3052631578947365, + "grad_norm": 0.30845019221305847, + "learning_rate": 1e-06, + "loss": 0.0632, + "step": 1636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.017578125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2767.0, + "completions/mean_length": 868.23828125, + "completions/mean_terminated_length": 590.6202392578125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.3321658670902252, + "epoch": 4.307894736842106, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.008871855214238167, + "learning_rate": 1e-06, + "loss": 0.0084, + "num_tokens": 542626583.0, + "reward": 0.8462392091751099, + "reward_std": 0.11010201275348663, + "rewards/progression_diversity/mean": -0.0049893660470843315, + "rewards/progression_diversity/std": 0.04094173386693001, + "rewards/symbolic_reward_accuracy/mean": 0.927734375, + "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, + "rewards/symbolic_reward_partial_score/mean": 0.970703125, + "rewards/symbolic_reward_partial_score/std": 0.12699371576309204, + "rewards/tag_count_reward/mean": -0.015625, + "rewards/tag_count_reward/std": 0.12414088100194931, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0612621307373047, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 366.0, + "sampling/sampling_logp_difference/mean": 2.047287940979004, + "step": 1637 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.3181557357311249, + "epoch": 4.310526315789474, + "grad_norm": 0.023050582036376, + "learning_rate": 1e-06, + "loss": 0.0581, + "step": 1638 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3208106458187103, + "epoch": 4.313157894736842, + "grad_norm": 0.01196917425841093, + "learning_rate": 1e-06, + "loss": 0.0703, + "step": 1639 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.3326588422060013, + "epoch": 4.315789473684211, + "grad_norm": 0.011661848984658718, + "learning_rate": 1e-06, + "loss": -0.0092, + "step": 1640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2995.0, + "completions/mean_length": 1302.34765625, + "completions/mean_terminated_length": 752.8137817382812, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.2890154719352722, + "epoch": 4.318421052631579, + "frac_reward_zero_std": 0.40625, + "grad_norm": 436.7637939453125, + "learning_rate": 1e-06, + "loss": 0.1363, + "num_tokens": 543714537.0, + "reward": 0.8115625977516174, + "reward_std": 0.11080306768417358, + "rewards/progression_diversity/mean": -0.010732462629675865, + "rewards/progression_diversity/std": 0.062084443867206573, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9412435293197632, + "rewards/symbolic_reward_partial_score/std": 0.18950168788433075, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.048242211341858, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 366.0, + "sampling/sampling_logp_difference/mean": 3.6092000007629395, + "step": 1641 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2962608188390732, + "epoch": 4.321052631578947, + "grad_norm": 0.027114521712064743, + "learning_rate": 1e-06, + "loss": 0.0896, + "step": 1642 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.3004533350467682, + "epoch": 4.323684210526316, + "grad_norm": 0.021577974781394005, + "learning_rate": 1e-06, + "loss": 0.0285, + "step": 1643 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.30283403396606445, + "epoch": 4.326315789473684, + "grad_norm": 0.01051260158419609, + "learning_rate": 1e-06, + "loss": 0.0431, + "step": 1644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3053.0, + "completions/mean_length": 1617.533203125, + "completions/mean_terminated_length": 730.9337768554688, + "completions/min_length": 253.0, + "completions/min_terminated_length": 253.0, + "entropy": 0.28192102909088135, + "epoch": 4.328947368421053, + "frac_reward_zero_std": 0.40625, + "grad_norm": 224.36424255371094, + "learning_rate": 1e-06, + "loss": 0.0977, + "num_tokens": 544947706.0, + "reward": 0.7883968949317932, + "reward_std": 0.1514188051223755, + "rewards/progression_diversity/mean": -0.017739124596118927, + "rewards/progression_diversity/std": 0.07712989300489426, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.9241536259651184, + "rewards/symbolic_reward_partial_score/std": 0.2255030870437622, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0469114780426025, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.760016441345215, + "step": 1645 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.30050475895404816, + "epoch": 4.331578947368421, + "grad_norm": 0.010516730137169361, + "learning_rate": 1e-06, + "loss": 0.0643, + "step": 1646 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.29950810968875885, + "epoch": 4.33421052631579, + "grad_norm": 4.544126510620117, + "learning_rate": 1e-06, + "loss": 0.0271, + "step": 1647 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.29230254888534546, + "epoch": 4.336842105263158, + "grad_norm": 4.5623393058776855, + "learning_rate": 1e-06, + "loss": 0.0658, + "step": 1648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3712.0, + "completions/mean_length": 1151.015625, + "completions/mean_terminated_length": 659.6290283203125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.31469690799713135, + "epoch": 4.339473684210526, + "frac_reward_zero_std": 0.53125, + "grad_norm": 301.08685302734375, + "learning_rate": 1e-06, + "loss": 0.0391, + "num_tokens": 545939170.0, + "reward": 0.8617491722106934, + "reward_std": 0.09493400156497955, + "rewards/progression_diversity/mean": -0.011606581509113312, + "rewards/progression_diversity/std": 0.06665636599063873, + "rewards/symbolic_reward_accuracy/mean": 0.94921875, + "rewards/symbolic_reward_accuracy/std": 0.21976542472839355, + "rewards/symbolic_reward_partial_score/mean": 0.9777017831802368, + "rewards/symbolic_reward_partial_score/std": 0.12183935195207596, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0512254238128662, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 366.0, + "sampling/sampling_logp_difference/mean": 3.1432154178619385, + "step": 1649 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.30177412927150726, + "epoch": 4.342105263157895, + "grad_norm": 318.8957214355469, + "learning_rate": 1e-06, + "loss": 0.0878, + "step": 1650 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2994520217180252, + "epoch": 4.344736842105263, + "grad_norm": 0.24904422461986542, + "learning_rate": 1e-06, + "loss": 0.0613, + "step": 1651 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3131738752126694, + "epoch": 4.347368421052631, + "grad_norm": 0.016566958278417587, + "learning_rate": 1e-06, + "loss": 0.0589, + "step": 1652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2135.0, + "completions/mean_length": 1417.859375, + "completions/mean_terminated_length": 649.5770263671875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.2909892499446869, + "epoch": 4.35, + "frac_reward_zero_std": 0.5, + "grad_norm": 281.6248474121094, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 547068762.0, + "reward": 0.8414824604988098, + "reward_std": 0.0903344452381134, + "rewards/progression_diversity/mean": -0.016799690201878548, + "rewards/progression_diversity/std": 0.07866127043962479, + "rewards/symbolic_reward_accuracy/mean": 0.921875, + "rewards/symbolic_reward_accuracy/std": 0.26863065361976624, + "rewards/symbolic_reward_partial_score/mean": 0.9656575322151184, + "rewards/symbolic_reward_partial_score/std": 0.1427777260541916, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0441806316375732, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 4.0731892585754395, + "step": 1653 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29775403439998627, + "epoch": 4.352631578947369, + "grad_norm": 1.3435653448104858, + "learning_rate": 1e-06, + "loss": 0.0371, + "step": 1654 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.27768969535827637, + "epoch": 4.355263157894737, + "grad_norm": 0.009355852380394936, + "learning_rate": 1e-06, + "loss": 0.1486, + "step": 1655 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3120851516723633, + "epoch": 4.3578947368421055, + "grad_norm": 0.5682896375656128, + "learning_rate": 1e-06, + "loss": 0.0239, + "step": 1656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3438.0, + "completions/mean_length": 1578.765625, + "completions/mean_terminated_length": 722.264404296875, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.2924637943506241, + "epoch": 4.360526315789474, + "frac_reward_zero_std": 0.375, + "grad_norm": 165.94813537597656, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 548296738.0, + "reward": 0.8284804821014404, + "reward_std": 0.11700120568275452, + "rewards/progression_diversity/mean": -0.018165672197937965, + "rewards/progression_diversity/std": 0.07916979491710663, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9549153447151184, + "rewards/symbolic_reward_partial_score/std": 0.1701822131872177, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.041003704071045, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 366.0, + "sampling/sampling_logp_difference/mean": 4.021053314208984, + "step": 1657 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2971417158842087, + "epoch": 4.363157894736842, + "grad_norm": 0.03589131683111191, + "learning_rate": 1e-06, + "loss": 0.082, + "step": 1658 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.28301428258419037, + "epoch": 4.36578947368421, + "grad_norm": 0.021585499867796898, + "learning_rate": 1e-06, + "loss": 0.1136, + "step": 1659 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.28591735661029816, + "epoch": 4.368421052631579, + "grad_norm": 0.006448809057474136, + "learning_rate": 1e-06, + "loss": 0.131, + "step": 1660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2655.0, + "completions/mean_length": 1222.650390625, + "completions/mean_terminated_length": 701.9575805664062, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.29014408588409424, + "epoch": 4.371052631578947, + "frac_reward_zero_std": 0.4375, + "grad_norm": 705.5008544921875, + "learning_rate": 1e-06, + "loss": 0.0896, + "num_tokens": 549336047.0, + "reward": 0.8229233026504517, + "reward_std": 0.11057721078395844, + "rewards/progression_diversity/mean": -0.012355408631265163, + "rewards/progression_diversity/std": 0.06910999119281769, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.9622395634651184, + "rewards/symbolic_reward_partial_score/std": 0.1456439197063446, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0430617332458496, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 366.0, + "sampling/sampling_logp_difference/mean": 3.94387149810791, + "step": 1661 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2902925908565521, + "epoch": 4.373684210526315, + "grad_norm": 184.36441040039062, + "learning_rate": 1e-06, + "loss": 0.1117, + "step": 1662 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2993343472480774, + "epoch": 4.376315789473685, + "grad_norm": 0.006076624616980553, + "learning_rate": 1e-06, + "loss": 0.0502, + "step": 1663 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.30178727209568024, + "epoch": 4.378947368421053, + "grad_norm": 0.00712067075073719, + "learning_rate": 1e-06, + "loss": 0.0184, + "step": 1664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3090.0, + "completions/mean_length": 1699.12109375, + "completions/mean_terminated_length": 752.6943969726562, + "completions/min_length": 250.0, + "completions/min_terminated_length": 250.0, + "entropy": 0.2760689854621887, + "epoch": 4.381578947368421, + "frac_reward_zero_std": 0.34375, + "grad_norm": 577.8743896484375, + "learning_rate": 1e-06, + "loss": 0.0909, + "num_tokens": 550629997.0, + "reward": 0.8087052702903748, + "reward_std": 0.1341739296913147, + "rewards/progression_diversity/mean": -0.018151385709643364, + "rewards/progression_diversity/std": 0.07787839323282242, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.9462890625, + "rewards/symbolic_reward_partial_score/std": 0.18004465103149414, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0394903421401978, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 364.0, + "sampling/sampling_logp_difference/mean": 4.506129741668701, + "step": 1665 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.26462263613939285, + "epoch": 4.38421052631579, + "grad_norm": 1588.7640380859375, + "learning_rate": 1e-06, + "loss": 0.3391, + "step": 1666 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.29205040633678436, + "epoch": 4.386842105263158, + "grad_norm": 0.009880495257675648, + "learning_rate": 1e-06, + "loss": 0.0564, + "step": 1667 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.28884167969226837, + "epoch": 4.389473684210526, + "grad_norm": 0.010468707419931889, + "learning_rate": 1e-06, + "loss": 0.0718, + "step": 1668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3253.0, + "completions/mean_length": 1559.23828125, + "completions/mean_terminated_length": 733.9423217773438, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "entropy": 0.26679860055446625, + "epoch": 4.3921052631578945, + "frac_reward_zero_std": 0.4375, + "grad_norm": 465.1615295410156, + "learning_rate": 1e-06, + "loss": 0.1201, + "num_tokens": 551842663.0, + "reward": 0.7915906310081482, + "reward_std": 0.1362362802028656, + "rewards/progression_diversity/mean": -0.015742970630526543, + "rewards/progression_diversity/std": 0.07118507474660873, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.9275715947151184, + "rewards/symbolic_reward_partial_score/std": 0.2216237634420395, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0484164953231812, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 366.0, + "sampling/sampling_logp_difference/mean": 3.136202096939087, + "step": 1669 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2917633354663849, + "epoch": 4.394736842105263, + "grad_norm": 0.009237710386514664, + "learning_rate": 1e-06, + "loss": 0.0597, + "step": 1670 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.29464130103588104, + "epoch": 4.397368421052631, + "grad_norm": 0.020612401887774467, + "learning_rate": 1e-06, + "loss": 0.0545, + "step": 1671 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29814429581165314, + "epoch": 4.4, + "grad_norm": 0.023844098672270775, + "learning_rate": 1e-06, + "loss": 0.0525, + "step": 1672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3713.0, + "completions/mean_length": 1361.876953125, + "completions/mean_terminated_length": 655.31494140625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.29356473684310913, + "epoch": 4.402631578947369, + "frac_reward_zero_std": 0.375, + "grad_norm": 48.91466522216797, + "learning_rate": 1e-06, + "loss": 0.0294, + "num_tokens": 552934888.0, + "reward": 0.8409208655357361, + "reward_std": 0.11717473715543747, + "rewards/progression_diversity/mean": -0.01435843575745821, + "rewards/progression_diversity/std": 0.06984733790159225, + "rewards/symbolic_reward_accuracy/mean": 0.927734375, + "rewards/symbolic_reward_accuracy/std": 0.2591804563999176, + "rewards/symbolic_reward_partial_score/mean": 0.95849609375, + "rewards/symbolic_reward_partial_score/std": 0.1762569397687912, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0481362342834473, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 366.0, + "sampling/sampling_logp_difference/mean": 3.0723586082458496, + "step": 1673 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.28971320390701294, + "epoch": 4.405263157894737, + "grad_norm": 17.948726654052734, + "learning_rate": 1e-06, + "loss": 0.1098, + "step": 1674 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.29446932673454285, + "epoch": 4.407894736842105, + "grad_norm": 0.010464577004313469, + "learning_rate": 1e-06, + "loss": 0.0555, + "step": 1675 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.28618983924388885, + "epoch": 4.410526315789474, + "grad_norm": 0.010811169631779194, + "learning_rate": 1e-06, + "loss": 0.1078, + "step": 1676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2688.0, + "completions/mean_length": 1110.826171875, + "completions/mean_terminated_length": 649.8651733398438, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.29554495215415955, + "epoch": 4.413157894736842, + "frac_reward_zero_std": 0.3125, + "grad_norm": 467.7513427734375, + "learning_rate": 1e-06, + "loss": 0.1092, + "num_tokens": 553879247.0, + "reward": 0.8643869161605835, + "reward_std": 0.11433817446231842, + "rewards/progression_diversity/mean": -0.011511346325278282, + "rewards/progression_diversity/std": 0.06805232167243958, + "rewards/symbolic_reward_accuracy/mean": 0.953125, + "rewards/symbolic_reward_accuracy/std": 0.21157780289649963, + "rewards/symbolic_reward_partial_score/mean": 0.9793294668197632, + "rewards/symbolic_reward_partial_score/std": 0.1068115308880806, + "rewards/tag_count_reward/mean": -0.01171875, + "rewards/tag_count_reward/std": 0.10772226005792618, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0465220212936401, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.3040924072265625, + "step": 1677 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.3359375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.2996697723865509, + "epoch": 4.41578947368421, + "grad_norm": 0.005097354296594858, + "learning_rate": 1e-06, + "loss": 0.0766, + "step": 1678 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.30967849493026733, + "epoch": 4.418421052631579, + "grad_norm": 0.008583194576203823, + "learning_rate": 1e-06, + "loss": 0.0206, + "step": 1679 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2984393239021301, + "epoch": 4.421052631578947, + "grad_norm": 0.020960450172424316, + "learning_rate": 1e-06, + "loss": 0.0607, + "step": 1680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.05859375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3201.0, + "completions/mean_length": 1718.21875, + "completions/mean_terminated_length": 805.4108276367188, + "completions/min_length": 252.0, + "completions/min_terminated_length": 252.0, + "entropy": 0.2852473855018616, + "epoch": 4.423684210526316, + "frac_reward_zero_std": 0.46875, + "grad_norm": 566.0900268554688, + "learning_rate": 1e-06, + "loss": 0.1011, + "num_tokens": 555183295.0, + "reward": 0.8038582801818848, + "reward_std": 0.10909964144229889, + "rewards/progression_diversity/mean": -0.019448932260274887, + "rewards/progression_diversity/std": 0.08251741528511047, + "rewards/symbolic_reward_accuracy/mean": 0.87890625, + "rewards/symbolic_reward_accuracy/std": 0.3265552520751953, + "rewards/symbolic_reward_partial_score/mean": 0.93798828125, + "rewards/symbolic_reward_partial_score/std": 0.2074093222618103, + "rewards/tag_count_reward/mean": -0.046875, + "rewards/tag_count_reward/std": 0.21157780289649963, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0440826416015625, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.4236390590667725, + "step": 1681 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2848607301712036, + "epoch": 4.426315789473684, + "grad_norm": 0.014388111419975758, + "learning_rate": 1e-06, + "loss": 0.0661, + "step": 1682 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.279374361038208, + "epoch": 4.428947368421053, + "grad_norm": 0.014383941888809204, + "learning_rate": 1e-06, + "loss": 0.0695, + "step": 1683 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2949568033218384, + "epoch": 4.431578947368421, + "grad_norm": 0.015598650090396404, + "learning_rate": 1e-06, + "loss": 0.0831, + "step": 1684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 15587.0, + "completions/mean_length": 1525.705078125, + "completions/mean_terminated_length": 762.958984375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.300376757979393, + "epoch": 4.434210526315789, + "frac_reward_zero_std": 0.3125, + "grad_norm": 411.582275390625, + "learning_rate": 1e-06, + "loss": 0.0576, + "num_tokens": 556383464.0, + "reward": 0.8135613203048706, + "reward_std": 0.15236347913742065, + "rewards/progression_diversity/mean": -0.015942061319947243, + "rewards/progression_diversity/std": 0.07582908123731613, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.9415689706802368, + "rewards/symbolic_reward_partial_score/std": 0.20094631612300873, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0429538488388062, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.6773757934570312, + "step": 1685 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3203125, + "entropy": 0.2941613495349884, + "epoch": 4.436842105263158, + "grad_norm": 0.00848439708352089, + "learning_rate": 1e-06, + "loss": 0.0689, + "step": 1686 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2935366630554199, + "epoch": 4.439473684210526, + "grad_norm": 6.972188949584961, + "learning_rate": 1e-06, + "loss": 0.0504, + "step": 1687 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2752288281917572, + "epoch": 4.442105263157894, + "grad_norm": 0.005782074760645628, + "learning_rate": 1e-06, + "loss": 0.1494, + "step": 1688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2501.0, + "completions/mean_length": 1201.7578125, + "completions/mean_terminated_length": 743.5411987304688, + "completions/min_length": 225.0, + "completions/min_terminated_length": 225.0, + "entropy": 0.29591110348701477, + "epoch": 4.4447368421052635, + "frac_reward_zero_std": 0.375, + "grad_norm": 890.4512939453125, + "learning_rate": 1e-06, + "loss": 0.0993, + "num_tokens": 557391724.0, + "reward": 0.8423638343811035, + "reward_std": 0.1323879361152649, + "rewards/progression_diversity/mean": -0.011668046936392784, + "rewards/progression_diversity/std": 0.06880556792020798, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9638671875, + "rewards/symbolic_reward_partial_score/std": 0.1620253175497055, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0433096885681152, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.715100049972534, + "step": 1689 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.3018334209918976, + "epoch": 4.447368421052632, + "grad_norm": 0.013348652981221676, + "learning_rate": 1e-06, + "loss": 0.0512, + "step": 1690 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2954847514629364, + "epoch": 4.45, + "grad_norm": 0.00606268085539341, + "learning_rate": 1e-06, + "loss": 0.0952, + "step": 1691 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.30556538701057434, + "epoch": 4.4526315789473685, + "grad_norm": 0.008453385904431343, + "learning_rate": 1e-06, + "loss": 0.0245, + "step": 1692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3241.0, + "completions/mean_length": 1361.220703125, + "completions/mean_terminated_length": 718.6986083984375, + "completions/min_length": 264.0, + "completions/min_terminated_length": 264.0, + "entropy": 0.2938489019870758, + "epoch": 4.455263157894737, + "frac_reward_zero_std": 0.4375, + "grad_norm": 246.77127075195312, + "learning_rate": 1e-06, + "loss": 0.0901, + "num_tokens": 558478461.0, + "reward": 0.8210785984992981, + "reward_std": 0.12946206331253052, + "rewards/progression_diversity/mean": -0.011284667067229748, + "rewards/progression_diversity/std": 0.06149439141154289, + "rewards/symbolic_reward_accuracy/mean": 0.8984375, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.9495442509651184, + "rewards/symbolic_reward_partial_score/std": 0.181060791015625, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.046877145767212, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.4112725257873535, + "step": 1693 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.3013473302125931, + "epoch": 4.457894736842105, + "grad_norm": 0.0174638070166111, + "learning_rate": 1e-06, + "loss": 0.0643, + "step": 1694 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.2852264940738678, + "epoch": 4.4605263157894735, + "grad_norm": 0.011137026362121105, + "learning_rate": 1e-06, + "loss": 0.1265, + "step": 1695 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.30904076993465424, + "epoch": 4.463157894736842, + "grad_norm": 0.007532245479524136, + "learning_rate": 1e-06, + "loss": 0.0096, + "step": 1696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3318.0, + "completions/mean_length": 1733.322265625, + "completions/mean_terminated_length": 853.6749877929688, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "entropy": 0.275277316570282, + "epoch": 4.465789473684211, + "frac_reward_zero_std": 0.5, + "grad_norm": 413.572509765625, + "learning_rate": 1e-06, + "loss": 0.1591, + "num_tokens": 559778914.0, + "reward": 0.8111288547515869, + "reward_std": 0.0881713479757309, + "rewards/progression_diversity/mean": -0.015045925043523312, + "rewards/progression_diversity/std": 0.06671184301376343, + "rewards/symbolic_reward_accuracy/mean": 0.89453125, + "rewards/symbolic_reward_accuracy/std": 0.3074568510055542, + "rewards/symbolic_reward_partial_score/mean": 0.93212890625, + "rewards/symbolic_reward_partial_score/std": 0.21600748598575592, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0425660610198975, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.744896650314331, + "step": 1697 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2827970087528229, + "epoch": 4.468421052631579, + "grad_norm": 0.008820455521345139, + "learning_rate": 1e-06, + "loss": 0.0781, + "step": 1698 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.27910783886909485, + "epoch": 4.471052631578948, + "grad_norm": 0.009887280873954296, + "learning_rate": 1e-06, + "loss": 0.0669, + "step": 1699 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2814963161945343, + "epoch": 4.473684210526316, + "grad_norm": 0.011021401733160019, + "learning_rate": 1e-06, + "loss": 0.083, + "step": 1700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2883.0, + "completions/mean_length": 1051.0390625, + "completions/mean_terminated_length": 745.6016235351562, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.29189206659793854, + "epoch": 4.476315789473684, + "frac_reward_zero_std": 0.53125, + "grad_norm": 435.6475830078125, + "learning_rate": 1e-06, + "loss": 0.0678, + "num_tokens": 560717846.0, + "reward": 0.8404079675674438, + "reward_std": 0.11957277357578278, + "rewards/progression_diversity/mean": -0.007058565504848957, + "rewards/progression_diversity/std": 0.05117020010948181, + "rewards/symbolic_reward_accuracy/mean": 0.919921875, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.9676106572151184, + "rewards/symbolic_reward_partial_score/std": 0.13442426919937134, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0530710220336914, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 2.474128484725952, + "step": 1701 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2915034145116806, + "epoch": 4.478947368421053, + "grad_norm": 0.009802755899727345, + "learning_rate": 1e-06, + "loss": 0.0604, + "step": 1702 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.30035799741744995, + "epoch": 4.481578947368421, + "grad_norm": 0.007560526020824909, + "learning_rate": 1e-06, + "loss": 0.0481, + "step": 1703 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.3007005453109741, + "epoch": 4.484210526315789, + "grad_norm": 0.007429624907672405, + "learning_rate": 1e-06, + "loss": 0.0331, + "step": 1704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3036.0, + "completions/mean_length": 1491.83984375, + "completions/mean_terminated_length": 791.3905639648438, + "completions/min_length": 220.0, + "completions/min_terminated_length": 220.0, + "entropy": 0.2913738787174225, + "epoch": 4.4868421052631575, + "frac_reward_zero_std": 0.34375, + "grad_norm": 304.2225341796875, + "learning_rate": 1e-06, + "loss": 0.0793, + "num_tokens": 561894820.0, + "reward": 0.8073344230651855, + "reward_std": 0.1524999737739563, + "rewards/progression_diversity/mean": -0.013631231151521206, + "rewards/progression_diversity/std": 0.06903047114610672, + "rewards/symbolic_reward_accuracy/mean": 0.880859375, + "rewards/symbolic_reward_accuracy/std": 0.32427072525024414, + "rewards/symbolic_reward_partial_score/mean": 0.9435221552848816, + "rewards/symbolic_reward_partial_score/std": 0.19161830842494965, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0431602001190186, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 368.0, + "sampling/sampling_logp_difference/mean": 3.739988088607788, + "step": 1705 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2814719080924988, + "epoch": 4.489473684210527, + "grad_norm": 0.014499134384095669, + "learning_rate": 1e-06, + "loss": 0.1048, + "step": 1706 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.28125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.294638067483902, + "epoch": 4.492105263157895, + "grad_norm": 0.01003054529428482, + "learning_rate": 1e-06, + "loss": 0.0582, + "step": 1707 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2986578643321991, + "epoch": 4.494736842105263, + "grad_norm": 0.014435573481023312, + "learning_rate": 1e-06, + "loss": 0.0615, + "step": 1708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2514.0, + "completions/mean_length": 1135.01953125, + "completions/mean_terminated_length": 769.0440063476562, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.3132117986679077, + "epoch": 4.497368421052632, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.014912238344550133, + "learning_rate": 1e-06, + "loss": 0.0168, + "num_tokens": 562862734.0, + "reward": 0.832605242729187, + "reward_std": 0.09201730787754059, + "rewards/progression_diversity/mean": -0.006079651415348053, + "rewards/progression_diversity/std": 0.044652536511421204, + "rewards/symbolic_reward_accuracy/mean": 0.912109375, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.95849609375, + "rewards/symbolic_reward_partial_score/std": 0.15315403044223785, + "rewards/tag_count_reward/mean": -0.021484375, + "rewards/tag_count_reward/std": 0.14513419568538666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052751064300537, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 370.0, + "sampling/sampling_logp_difference/mean": 2.5634231567382812, + "step": 1709 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3069326877593994, + "epoch": 4.5, + "grad_norm": 0.00920610036700964, + "learning_rate": 1e-06, + "loss": 0.0197, + "step": 1710 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.28757910430431366, + "epoch": 4.502631578947368, + "grad_norm": 0.006706486456096172, + "learning_rate": 1e-06, + "loss": 0.1082, + "step": 1711 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.29572418332099915, + "epoch": 4.505263157894737, + "grad_norm": 0.008948331698775291, + "learning_rate": 1e-06, + "loss": 0.0682, + "step": 1712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3745.0, + "completions/mean_length": 1313.30078125, + "completions/mean_terminated_length": 795.7212524414062, + "completions/min_length": 271.0, + "completions/min_terminated_length": 271.0, + "entropy": 0.29290150105953217, + "epoch": 4.507894736842105, + "frac_reward_zero_std": 0.59375, + "grad_norm": 150.0733184814453, + "learning_rate": 1e-06, + "loss": 0.0185, + "num_tokens": 563937928.0, + "reward": 0.8227621912956238, + "reward_std": 0.09591831266880035, + "rewards/progression_diversity/mean": -0.008939700201153755, + "rewards/progression_diversity/std": 0.055079780519008636, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.958984375, + "rewards/symbolic_reward_partial_score/std": 0.14947035908699036, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0529078245162964, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 370.0, + "sampling/sampling_logp_difference/mean": 2.372925281524658, + "step": 1713 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3026455044746399, + "epoch": 4.510526315789473, + "grad_norm": 0.013394175097346306, + "learning_rate": 1e-06, + "loss": 0.0088, + "step": 1714 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.29875096678733826, + "epoch": 4.5131578947368425, + "grad_norm": 0.007891521789133549, + "learning_rate": 1e-06, + "loss": 0.0854, + "step": 1715 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.29000018537044525, + "epoch": 4.515789473684211, + "grad_norm": 0.010807064361870289, + "learning_rate": 1e-06, + "loss": 0.0695, + "step": 1716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3374.0, + "completions/mean_length": 1286.517578125, + "completions/mean_terminated_length": 830.859130859375, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.3059823364019394, + "epoch": 4.518421052631579, + "frac_reward_zero_std": 0.65625, + "grad_norm": 41.614864349365234, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 564988241.0, + "reward": 0.8157514333724976, + "reward_std": 0.08176864683628082, + "rewards/progression_diversity/mean": -0.006893161218613386, + "rewards/progression_diversity/std": 0.0425245501101017, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.943359375, + "rewards/symbolic_reward_partial_score/std": 0.18591801822185516, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0574617385864258, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 370.0, + "sampling/sampling_logp_difference/mean": 1.9077733755111694, + "step": 1717 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2892063856124878, + "epoch": 4.521052631578947, + "grad_norm": 0.005814536940306425, + "learning_rate": 1e-06, + "loss": 0.0526, + "step": 1718 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29770365357398987, + "epoch": 4.523684210526316, + "grad_norm": 0.021579096093773842, + "learning_rate": 1e-06, + "loss": 0.0965, + "step": 1719 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.30083292722702026, + "epoch": 4.526315789473684, + "grad_norm": 0.016088807955384254, + "learning_rate": 1e-06, + "loss": 0.0295, + "step": 1720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3407.0, + "completions/mean_length": 1413.296875, + "completions/mean_terminated_length": 899.1515502929688, + "completions/min_length": 286.0, + "completions/min_terminated_length": 286.0, + "entropy": 0.29098886251449585, + "epoch": 4.528947368421052, + "frac_reward_zero_std": 0.5625, + "grad_norm": 66.23799896240234, + "learning_rate": 1e-06, + "loss": 0.0473, + "num_tokens": 566137193.0, + "reward": 0.8391268253326416, + "reward_std": 0.11196374893188477, + "rewards/progression_diversity/mean": -0.008220801129937172, + "rewards/progression_diversity/std": 0.05032823607325554, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9562174081802368, + "rewards/symbolic_reward_partial_score/std": 0.18327513337135315, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0517854690551758, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 372.0, + "sampling/sampling_logp_difference/mean": 2.562303304672241, + "step": 1721 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2914315462112427, + "epoch": 4.531578947368421, + "grad_norm": 7.667601585388184, + "learning_rate": 1e-06, + "loss": 0.043, + "step": 1722 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2990102767944336, + "epoch": 4.534210526315789, + "grad_norm": 0.007618363946676254, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 1723 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.29170943796634674, + "epoch": 4.536842105263158, + "grad_norm": 0.028170811012387276, + "learning_rate": 1e-06, + "loss": 0.0476, + "step": 1724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03515625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3794.0, + "completions/mean_length": 1380.935546875, + "completions/mean_terminated_length": 834.2651977539062, + "completions/min_length": 293.0, + "completions/min_terminated_length": 293.0, + "entropy": 0.2995843440294266, + "epoch": 4.5394736842105265, + "frac_reward_zero_std": 0.5625, + "grad_norm": 356.0476379394531, + "learning_rate": 1e-06, + "loss": 0.053, + "num_tokens": 567248200.0, + "reward": 0.8191208243370056, + "reward_std": 0.08875949680805206, + "rewards/progression_diversity/mean": -0.006864731200039387, + "rewards/progression_diversity/std": 0.0402359738945961, + "rewards/symbolic_reward_accuracy/mean": 0.890625, + "rewards/symbolic_reward_accuracy/std": 0.31241437792778015, + "rewards/symbolic_reward_partial_score/mean": 0.9597981572151184, + "rewards/symbolic_reward_partial_score/std": 0.14745070040225983, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0555503368377686, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 372.0, + "sampling/sampling_logp_difference/mean": 2.510345220565796, + "step": 1725 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30750222504138947, + "epoch": 4.542105263157895, + "grad_norm": 0.007456324994564056, + "learning_rate": 1e-06, + "loss": 0.0425, + "step": 1726 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29517829418182373, + "epoch": 4.544736842105263, + "grad_norm": 0.017573416233062744, + "learning_rate": 1e-06, + "loss": 0.0846, + "step": 1727 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.3013046234846115, + "epoch": 4.5473684210526315, + "grad_norm": 0.003968199715018272, + "learning_rate": 1e-06, + "loss": 0.0405, + "step": 1728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3952.0, + "completions/mean_length": 1314.697265625, + "completions/mean_terminated_length": 859.8892822265625, + "completions/min_length": 248.0, + "completions/min_terminated_length": 248.0, + "entropy": 0.2868567556142807, + "epoch": 4.55, + "frac_reward_zero_std": 0.5625, + "grad_norm": 259.5559387207031, + "learning_rate": 1e-06, + "loss": 0.0893, + "num_tokens": 568328301.0, + "reward": 0.8367007374763489, + "reward_std": 0.10205796360969543, + "rewards/progression_diversity/mean": -0.006686838809400797, + "rewards/progression_diversity/std": 0.0429992638528347, + "rewards/symbolic_reward_accuracy/mean": 0.919921875, + "rewards/symbolic_reward_accuracy/std": 0.271679550409317, + "rewards/symbolic_reward_partial_score/mean": 0.9591470956802368, + "rewards/symbolic_reward_partial_score/std": 0.17069299519062042, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.058121681213379, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 372.0, + "sampling/sampling_logp_difference/mean": 2.003477096557617, + "step": 1729 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.29739847779273987, + "epoch": 4.552631578947368, + "grad_norm": 0.022392338141798973, + "learning_rate": 1e-06, + "loss": 0.0507, + "step": 1730 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.15625, + "entropy": 0.30387774109840393, + "epoch": 4.5552631578947365, + "grad_norm": 0.00769463088363409, + "learning_rate": 1e-06, + "loss": 0.0124, + "step": 1731 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.2966042011976242, + "epoch": 4.557894736842105, + "grad_norm": 0.012574239633977413, + "learning_rate": 1e-06, + "loss": 0.0445, + "step": 1732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3764.0, + "completions/mean_length": 1427.546875, + "completions/mean_terminated_length": 851.1318359375, + "completions/min_length": 240.0, + "completions/min_terminated_length": 240.0, + "entropy": 0.30412548780441284, + "epoch": 4.560526315789474, + "frac_reward_zero_std": 0.59375, + "grad_norm": 225.99496459960938, + "learning_rate": 1e-06, + "loss": 0.0302, + "num_tokens": 569455493.0, + "reward": 0.826937198638916, + "reward_std": 0.10923168063163757, + "rewards/progression_diversity/mean": -0.006476983428001404, + "rewards/progression_diversity/std": 0.03768179193139076, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9480794072151184, + "rewards/symbolic_reward_partial_score/std": 0.20172148942947388, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0539195537567139, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 372.0, + "sampling/sampling_logp_difference/mean": 2.804877281188965, + "step": 1733 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2931108772754669, + "epoch": 4.563157894736842, + "grad_norm": 0.008534945547580719, + "learning_rate": 1e-06, + "loss": 0.0567, + "step": 1734 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.27719876170158386, + "epoch": 4.565789473684211, + "grad_norm": 0.015061067417263985, + "learning_rate": 1e-06, + "loss": 0.1137, + "step": 1735 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2918764352798462, + "epoch": 4.568421052631579, + "grad_norm": 0.010875429026782513, + "learning_rate": 1e-06, + "loss": 0.0713, + "step": 1736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.009765625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3546.0, + "completions/mean_length": 957.4921875, + "completions/mean_terminated_length": 805.3569946289062, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.31101664900779724, + "epoch": 4.571052631578947, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.010922472923994064, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 570348193.0, + "reward": 0.875369131565094, + "reward_std": 0.06468972563743591, + "rewards/progression_diversity/mean": -0.0021541740279644728, + "rewards/progression_diversity/std": 0.023635899648070335, + "rewards/symbolic_reward_accuracy/mean": 0.96875, + "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, + "rewards/symbolic_reward_partial_score/mean": 0.9837239980697632, + "rewards/symbolic_reward_partial_score/std": 0.10963507741689682, + "rewards/tag_count_reward/mean": -0.009765625, + "rewards/tag_count_reward/std": 0.09843364357948303, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0634751319885254, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 372.0, + "sampling/sampling_logp_difference/mean": 1.4045908451080322, + "step": 1737 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1328125, + "entropy": 0.310005784034729, + "epoch": 4.573684210526316, + "grad_norm": 0.012388059869408607, + "learning_rate": 1e-06, + "loss": -0.0014, + "step": 1738 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.30088475346565247, + "epoch": 4.576315789473684, + "grad_norm": 0.002842206507921219, + "learning_rate": 1e-06, + "loss": 0.0399, + "step": 1739 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.3011449873447418, + "epoch": 4.578947368421053, + "grad_norm": 0.03648176044225693, + "learning_rate": 1e-06, + "loss": 0.087, + "step": 1740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.037109375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3054.0, + "completions/mean_length": 1486.138671875, + "completions/mean_terminated_length": 911.9817504882812, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "entropy": 0.2893022149801254, + "epoch": 4.581578947368421, + "frac_reward_zero_std": 0.5, + "grad_norm": 198.0384521484375, + "learning_rate": 1e-06, + "loss": 0.0774, + "num_tokens": 571519400.0, + "reward": 0.8131746053695679, + "reward_std": 0.091860830783844, + "rewards/progression_diversity/mean": -0.005787876434624195, + "rewards/progression_diversity/std": 0.035509541630744934, + "rewards/symbolic_reward_accuracy/mean": 0.884765625, + "rewards/symbolic_reward_accuracy/std": 0.3196168541908264, + "rewards/symbolic_reward_partial_score/mean": 0.95166015625, + "rewards/symbolic_reward_partial_score/std": 0.1652219146490097, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0542500019073486, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 372.0, + "sampling/sampling_logp_difference/mean": 2.839118003845215, + "step": 1741 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.28446468710899353, + "epoch": 4.58421052631579, + "grad_norm": 0.019028333947062492, + "learning_rate": 1e-06, + "loss": 0.1049, + "step": 1742 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2950809597969055, + "epoch": 4.586842105263158, + "grad_norm": 0.010133459232747555, + "learning_rate": 1e-06, + "loss": 0.0021, + "step": 1743 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.28551898896694183, + "epoch": 4.589473684210526, + "grad_norm": 0.449716717004776, + "learning_rate": 1e-06, + "loss": 0.0884, + "step": 1744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4694.0, + "completions/mean_length": 1332.86328125, + "completions/mean_terminated_length": 1002.399169921875, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.29140228033065796, + "epoch": 4.592105263157895, + "frac_reward_zero_std": 0.59375, + "grad_norm": 71.57343292236328, + "learning_rate": 1e-06, + "loss": 0.0489, + "num_tokens": 572641506.0, + "reward": 0.8343420624732971, + "reward_std": 0.097034752368927, + "rewards/progression_diversity/mean": -0.0032937643118202686, + "rewards/progression_diversity/std": 0.02761550061404705, + "rewards/symbolic_reward_accuracy/mean": 0.9140625, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.958984375, + "rewards/symbolic_reward_partial_score/std": 0.16060270369052887, + "rewards/tag_count_reward/mean": -0.017578125, + "rewards/tag_count_reward/std": 0.13154059648513794, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0581992864608765, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 372.0, + "sampling/sampling_logp_difference/mean": 1.9309183359146118, + "step": 1745 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.297273188829422, + "epoch": 4.594736842105263, + "grad_norm": 0.005121263209730387, + "learning_rate": 1e-06, + "loss": 0.0292, + "step": 1746 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.27994534373283386, + "epoch": 4.597368421052631, + "grad_norm": 0.008685395121574402, + "learning_rate": 1e-06, + "loss": 0.0841, + "step": 1747 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29429104924201965, + "epoch": 4.6, + "grad_norm": 0.004201680421829224, + "learning_rate": 1e-06, + "loss": 0.018, + "step": 1748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3964.0, + "completions/mean_length": 1609.158203125, + "completions/mean_terminated_length": 1008.5548706054688, + "completions/min_length": 275.0, + "completions/min_terminated_length": 275.0, + "entropy": 0.27888044714927673, + "epoch": 4.602631578947369, + "frac_reward_zero_std": 0.65625, + "grad_norm": 37.15717697143555, + "learning_rate": 1e-06, + "loss": 0.0756, + "num_tokens": 573900403.0, + "reward": 0.8299179673194885, + "reward_std": 0.08020342141389847, + "rewards/progression_diversity/mean": -0.006251877639442682, + "rewards/progression_diversity/std": 0.038109783083200455, + "rewards/symbolic_reward_accuracy/mean": 0.9140625, + "rewards/symbolic_reward_accuracy/std": 0.28054583072662354, + "rewards/symbolic_reward_partial_score/mean": 0.9501953125, + "rewards/symbolic_reward_partial_score/std": 0.183476984500885, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0542418956756592, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 374.0, + "sampling/sampling_logp_difference/mean": 2.3596372604370117, + "step": 1749 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.27801230549812317, + "epoch": 4.605263157894737, + "grad_norm": 213.15374755859375, + "learning_rate": 1e-06, + "loss": 0.1129, + "step": 1750 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2716394513845444, + "epoch": 4.6078947368421055, + "grad_norm": 0.006218337453901768, + "learning_rate": 1e-06, + "loss": 0.1002, + "step": 1751 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2844415009021759, + "epoch": 4.610526315789474, + "grad_norm": 0.013139521703124046, + "learning_rate": 1e-06, + "loss": 0.0223, + "step": 1752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3099.0, + "completions/mean_length": 1402.400390625, + "completions/mean_terminated_length": 919.1229858398438, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.2805843651294708, + "epoch": 4.613157894736842, + "frac_reward_zero_std": 0.625, + "grad_norm": 392.7408142089844, + "learning_rate": 1e-06, + "loss": 0.0717, + "num_tokens": 575023424.0, + "reward": 0.8217633366584778, + "reward_std": 0.07674242556095123, + "rewards/progression_diversity/mean": -0.00628986582159996, + "rewards/progression_diversity/std": 0.040007736533880234, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9444986581802368, + "rewards/symbolic_reward_partial_score/std": 0.19026826322078705, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0610547065734863, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 376.0, + "sampling/sampling_logp_difference/mean": 1.5220887660980225, + "step": 1753 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2966681718826294, + "epoch": 4.61578947368421, + "grad_norm": 0.011541535146534443, + "learning_rate": 1e-06, + "loss": 0.0328, + "step": 1754 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2979959547519684, + "epoch": 4.618421052631579, + "grad_norm": 0.007087912876158953, + "learning_rate": 1e-06, + "loss": 0.0277, + "step": 1755 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.29420797526836395, + "epoch": 4.621052631578947, + "grad_norm": 0.012560022063553333, + "learning_rate": 1e-06, + "loss": 0.0552, + "step": 1756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.021484375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4188.0, + "completions/mean_length": 1243.05859375, + "completions/mean_terminated_length": 910.6227416992188, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.3041873723268509, + "epoch": 4.623684210526315, + "frac_reward_zero_std": 0.59375, + "grad_norm": 191.67491149902344, + "learning_rate": 1e-06, + "loss": 0.0741, + "num_tokens": 576051486.0, + "reward": 0.8431257009506226, + "reward_std": 0.0861952155828476, + "rewards/progression_diversity/mean": -0.0038361886981874704, + "rewards/progression_diversity/std": 0.03068256378173828, + "rewards/symbolic_reward_accuracy/mean": 0.92578125, + "rewards/symbolic_reward_accuracy/std": 0.2623828947544098, + "rewards/symbolic_reward_partial_score/mean": 0.9654947519302368, + "rewards/symbolic_reward_partial_score/std": 0.15342977643013, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0610965490341187, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 376.0, + "sampling/sampling_logp_difference/mean": 1.342087984085083, + "step": 1757 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2971126586198807, + "epoch": 4.626315789473685, + "grad_norm": 0.01794944889843464, + "learning_rate": 1e-06, + "loss": 0.0504, + "step": 1758 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.305801659822464, + "epoch": 4.628947368421053, + "grad_norm": 0.012613825500011444, + "learning_rate": 1e-06, + "loss": 0.0083, + "step": 1759 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.30587300658226013, + "epoch": 4.631578947368421, + "grad_norm": 0.022545624524354935, + "learning_rate": 1e-06, + "loss": 0.0683, + "step": 1760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.02734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3825.0, + "completions/mean_length": 1357.8828125, + "completions/mean_terminated_length": 935.4617919921875, + "completions/min_length": 294.0, + "completions/min_terminated_length": 294.0, + "entropy": 0.30523401498794556, + "epoch": 4.63421052631579, + "frac_reward_zero_std": 0.71875, + "grad_norm": 0.04670589789748192, + "learning_rate": 1e-06, + "loss": 0.0298, + "num_tokens": 577145698.0, + "reward": 0.8240532279014587, + "reward_std": 0.07884141802787781, + "rewards/progression_diversity/mean": -0.006792136933654547, + "rewards/progression_diversity/std": 0.047225385904312134, + "rewards/symbolic_reward_accuracy/mean": 0.904296875, + "rewards/symbolic_reward_accuracy/std": 0.2944713830947876, + "rewards/symbolic_reward_partial_score/mean": 0.9475911259651184, + "rewards/symbolic_reward_partial_score/std": 0.18192888796329498, + "rewards/tag_count_reward/mean": -0.02734375, + "rewards/tag_count_reward/std": 0.16324250400066376, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0601003170013428, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 376.0, + "sampling/sampling_logp_difference/mean": 1.485578179359436, + "step": 1761 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.29327723383903503, + "epoch": 4.636842105263158, + "grad_norm": 0.01936684362590313, + "learning_rate": 1e-06, + "loss": 0.0702, + "step": 1762 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1171875, + "entropy": 0.29366083443164825, + "epoch": 4.639473684210526, + "grad_norm": 0.010277163237333298, + "learning_rate": 1e-06, + "loss": 0.0261, + "step": 1763 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.0546875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.109375, + "entropy": 0.29086172580718994, + "epoch": 4.6421052631578945, + "grad_norm": 0.008378474041819572, + "learning_rate": 1e-06, + "loss": 0.0463, + "step": 1764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.033203125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3414.0, + "completions/mean_length": 1404.08203125, + "completions/mean_terminated_length": 889.6202392578125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "entropy": 0.30439208447933197, + "epoch": 4.644736842105263, + "frac_reward_zero_std": 0.59375, + "grad_norm": 203.2693328857422, + "learning_rate": 1e-06, + "loss": 0.0463, + "num_tokens": 578260716.0, + "reward": 0.8370853662490845, + "reward_std": 0.09505656361579895, + "rewards/progression_diversity/mean": -0.007285004947334528, + "rewards/progression_diversity/std": 0.046391766518354416, + "rewards/symbolic_reward_accuracy/mean": 0.91796875, + "rewards/symbolic_reward_accuracy/std": 0.2746807038784027, + "rewards/symbolic_reward_partial_score/mean": 0.9656575322151184, + "rewards/symbolic_reward_partial_score/std": 0.1476442664861679, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0571269989013672, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 378.0, + "sampling/sampling_logp_difference/mean": 2.06299090385437, + "step": 1765 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29458560049533844, + "epoch": 4.647368421052631, + "grad_norm": 0.011282769963145256, + "learning_rate": 1e-06, + "loss": 0.0713, + "step": 1766 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.0859375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2905200123786926, + "epoch": 4.65, + "grad_norm": 5.091102600097656, + "learning_rate": 1e-06, + "loss": 0.0594, + "step": 1767 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2920839935541153, + "epoch": 4.652631578947369, + "grad_norm": 0.00980228092521429, + "learning_rate": 1e-06, + "loss": 0.0531, + "step": 1768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 12152.0, + "completions/mean_length": 1461.2734375, + "completions/mean_terminated_length": 979.8951416015625, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.3031252771615982, + "epoch": 4.655263157894737, + "frac_reward_zero_std": 0.59375, + "grad_norm": 0.020676128566265106, + "learning_rate": 1e-06, + "loss": 0.0366, + "num_tokens": 579421880.0, + "reward": 0.8634036183357239, + "reward_std": 0.1007419154047966, + "rewards/progression_diversity/mean": -0.007299537770450115, + "rewards/progression_diversity/std": 0.04725031182169914, + "rewards/symbolic_reward_accuracy/mean": 0.958984375, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.9700520634651184, + "rewards/symbolic_reward_partial_score/std": 0.16211473941802979, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0525739192962646, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 380.0, + "sampling/sampling_logp_difference/mean": 2.551374912261963, + "step": 1769 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.28818726539611816, + "epoch": 4.657894736842105, + "grad_norm": 0.013333577662706375, + "learning_rate": 1e-06, + "loss": 0.0613, + "step": 1770 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2797438055276871, + "epoch": 4.660526315789474, + "grad_norm": 12.496295928955078, + "learning_rate": 1e-06, + "loss": 0.089, + "step": 1771 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29146166145801544, + "epoch": 4.663157894736842, + "grad_norm": 7.178304672241211, + "learning_rate": 1e-06, + "loss": 0.0752, + "step": 1772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3135.0, + "completions/mean_length": 1521.27734375, + "completions/mean_terminated_length": 1072.7042236328125, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.28964005410671234, + "epoch": 4.66578947368421, + "frac_reward_zero_std": 0.4375, + "grad_norm": 328.8640441894531, + "learning_rate": 1e-06, + "loss": 0.1406, + "num_tokens": 580629446.0, + "reward": 0.8446449041366577, + "reward_std": 0.14509400725364685, + "rewards/progression_diversity/mean": -0.008165711537003517, + "rewards/progression_diversity/std": 0.05180385336279869, + "rewards/symbolic_reward_accuracy/mean": 0.931640625, + "rewards/symbolic_reward_accuracy/std": 0.25260838866233826, + "rewards/symbolic_reward_partial_score/mean": 0.9622395634651184, + "rewards/symbolic_reward_partial_score/std": 0.16547498106956482, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0513108968734741, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 380.0, + "sampling/sampling_logp_difference/mean": 2.6449103355407715, + "step": 1773 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.2889109253883362, + "epoch": 4.668421052631579, + "grad_norm": 0.018843237310647964, + "learning_rate": 1e-06, + "loss": 0.048, + "step": 1774 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.28207381069660187, + "epoch": 4.671052631578947, + "grad_norm": 0.029016956686973572, + "learning_rate": 1e-06, + "loss": 0.0446, + "step": 1775 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.28926801681518555, + "epoch": 4.673684210526316, + "grad_norm": 0.013139498420059681, + "learning_rate": 1e-06, + "loss": 0.0746, + "step": 1776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3197.0, + "completions/mean_length": 1161.837890625, + "completions/mean_terminated_length": 858.6076049804688, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.2987944483757019, + "epoch": 4.676315789473684, + "frac_reward_zero_std": 0.65625, + "grad_norm": 314.06134033203125, + "learning_rate": 1e-06, + "loss": 0.0431, + "num_tokens": 581621939.0, + "reward": 0.8670340776443481, + "reward_std": 0.08653353154659271, + "rewards/progression_diversity/mean": -0.005582699552178383, + "rewards/progression_diversity/std": 0.042026419192552567, + "rewards/symbolic_reward_accuracy/mean": 0.958984375, + "rewards/symbolic_reward_accuracy/std": 0.19852031767368317, + "rewards/symbolic_reward_partial_score/mean": 0.9788411259651184, + "rewards/symbolic_reward_partial_score/std": 0.12187561392784119, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.05836021900177, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 380.0, + "sampling/sampling_logp_difference/mean": 2.0134835243225098, + "step": 1777 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.312513142824173, + "epoch": 4.678947368421053, + "grad_norm": 0.004544838331639767, + "learning_rate": 1e-06, + "loss": -0.0029, + "step": 1778 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1640625, + "entropy": 0.28765228390693665, + "epoch": 4.681578947368421, + "grad_norm": 0.00618336908519268, + "learning_rate": 1e-06, + "loss": 0.0752, + "step": 1779 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29780954122543335, + "epoch": 4.684210526315789, + "grad_norm": 0.003846309846267104, + "learning_rate": 1e-06, + "loss": 0.0645, + "step": 1780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3127.0, + "completions/mean_length": 1343.55078125, + "completions/mean_terminated_length": 858.375, + "completions/min_length": 266.0, + "completions/min_terminated_length": 266.0, + "entropy": 0.29853370785713196, + "epoch": 4.686842105263158, + "frac_reward_zero_std": 0.65625, + "grad_norm": 134.65643310546875, + "learning_rate": 1e-06, + "loss": 0.0479, + "num_tokens": 582688941.0, + "reward": 0.8453360795974731, + "reward_std": 0.08855551481246948, + "rewards/progression_diversity/mean": -0.007415060419589281, + "rewards/progression_diversity/std": 0.04754326492547989, + "rewards/symbolic_reward_accuracy/mean": 0.935546875, + "rewards/symbolic_reward_accuracy/std": 0.24579854309558868, + "rewards/symbolic_reward_partial_score/mean": 0.9573568105697632, + "rewards/symbolic_reward_partial_score/std": 0.1826906055212021, + "rewards/tag_count_reward/mean": -0.03125, + "rewards/tag_count_reward/std": 0.17416280508041382, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0548827648162842, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 380.0, + "sampling/sampling_logp_difference/mean": 2.597440242767334, + "step": 1781 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.30938100814819336, + "epoch": 4.689473684210526, + "grad_norm": 0.007459544111043215, + "learning_rate": 1e-06, + "loss": 0.0288, + "step": 1782 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.2973015457391739, + "epoch": 4.692105263157895, + "grad_norm": 0.013295495882630348, + "learning_rate": 1e-06, + "loss": 0.0913, + "step": 1783 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.29738104343414307, + "epoch": 4.6947368421052635, + "grad_norm": 0.008674710988998413, + "learning_rate": 1e-06, + "loss": 0.1016, + "step": 1784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.048828125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8991.0, + "completions/mean_length": 1652.69921875, + "completions/mean_terminated_length": 896.4722900390625, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.2916813790798187, + "epoch": 4.697368421052632, + "frac_reward_zero_std": 0.53125, + "grad_norm": 311.70086669921875, + "learning_rate": 1e-06, + "loss": 0.1287, + "num_tokens": 583923411.0, + "reward": 0.816964864730835, + "reward_std": 0.0918387770652771, + "rewards/progression_diversity/mean": -0.00762150390073657, + "rewards/progression_diversity/std": 0.04115252196788788, + "rewards/symbolic_reward_accuracy/mean": 0.896484375, + "rewards/symbolic_reward_accuracy/std": 0.30492907762527466, + "rewards/symbolic_reward_partial_score/mean": 0.9454752802848816, + "rewards/symbolic_reward_partial_score/std": 0.191688671708107, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0515938997268677, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 380.0, + "sampling/sampling_logp_difference/mean": 2.775969982147217, + "step": 1785 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2890394926071167, + "epoch": 4.7, + "grad_norm": 0.01550104096531868, + "learning_rate": 1e-06, + "loss": 0.1227, + "step": 1786 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2989995628595352, + "epoch": 4.7026315789473685, + "grad_norm": 0.008528691716492176, + "learning_rate": 1e-06, + "loss": 0.0246, + "step": 1787 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.29257068037986755, + "epoch": 4.705263157894737, + "grad_norm": 0.010270673781633377, + "learning_rate": 1e-06, + "loss": 0.0584, + "step": 1788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.029296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3506.0, + "completions/mean_length": 1359.19140625, + "completions/mean_terminated_length": 905.726318359375, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "entropy": 0.3098142296075821, + "epoch": 4.707894736842105, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.009268060326576233, + "learning_rate": 1e-06, + "loss": -0.0146, + "num_tokens": 585016949.0, + "reward": 0.8493025302886963, + "reward_std": 0.07921440899372101, + "rewards/progression_diversity/mean": -0.006267758086323738, + "rewards/progression_diversity/std": 0.039985544979572296, + "rewards/symbolic_reward_accuracy/mean": 0.9375, + "rewards/symbolic_reward_accuracy/std": 0.2422981858253479, + "rewards/symbolic_reward_partial_score/mean": 0.9646809697151184, + "rewards/symbolic_reward_partial_score/std": 0.15783125162124634, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.058639645576477, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 380.0, + "sampling/sampling_logp_difference/mean": 1.931031346321106, + "step": 1789 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3012120723724365, + "epoch": 4.7105263157894735, + "grad_norm": 0.018348556011915207, + "learning_rate": 1e-06, + "loss": 0.0325, + "step": 1790 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.3086209297180176, + "epoch": 4.713157894736842, + "grad_norm": 0.03524219989776611, + "learning_rate": 1e-06, + "loss": 0.0427, + "step": 1791 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.29122862219810486, + "epoch": 4.715789473684211, + "grad_norm": 0.004257425665855408, + "learning_rate": 1e-06, + "loss": 0.1065, + "step": 1792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.044921875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3446.0, + "completions/mean_length": 1595.56640625, + "completions/mean_terminated_length": 899.9959106445312, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.287975549697876, + "epoch": 4.718421052631579, + "frac_reward_zero_std": 0.5, + "grad_norm": 392.2756042480469, + "learning_rate": 1e-06, + "loss": 0.1354, + "num_tokens": 586220343.0, + "reward": 0.8267496824264526, + "reward_std": 0.13566027581691742, + "rewards/progression_diversity/mean": -0.010585745796561241, + "rewards/progression_diversity/std": 0.05752396956086159, + "rewards/symbolic_reward_accuracy/mean": 0.912109375, + "rewards/symbolic_reward_accuracy/std": 0.2834126651287079, + "rewards/symbolic_reward_partial_score/mean": 0.9462890625, + "rewards/symbolic_reward_partial_score/std": 0.20108237862586975, + "rewards/tag_count_reward/mean": -0.04296875, + "rewards/tag_count_reward/std": 0.2029850035905838, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.048526644706726, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 380.0, + "sampling/sampling_logp_difference/mean": 3.1703460216522217, + "step": 1793 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.29304662346839905, + "epoch": 4.721052631578948, + "grad_norm": 282.28704833984375, + "learning_rate": 1e-06, + "loss": 0.1636, + "step": 1794 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.29584261775016785, + "epoch": 4.723684210526316, + "grad_norm": 0.01929856278002262, + "learning_rate": 1e-06, + "loss": 0.1043, + "step": 1795 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.30293792486190796, + "epoch": 4.726315789473684, + "grad_norm": 0.007616270799189806, + "learning_rate": 1e-06, + "loss": 0.0534, + "step": 1796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01953125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3592.0, + "completions/mean_length": 1216.298828125, + "completions/mean_terminated_length": 914.1534423828125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.2999269813299179, + "epoch": 4.728947368421053, + "frac_reward_zero_std": 0.65625, + "grad_norm": 63.879966735839844, + "learning_rate": 1e-06, + "loss": 0.0632, + "num_tokens": 587246896.0, + "reward": 0.8688350915908813, + "reward_std": 0.0851588174700737, + "rewards/progression_diversity/mean": -0.0061400942504405975, + "rewards/progression_diversity/std": 0.04974029213190079, + "rewards/symbolic_reward_accuracy/mean": 0.962890625, + "rewards/symbolic_reward_accuracy/std": 0.18921469151973724, + "rewards/symbolic_reward_partial_score/mean": 0.97705078125, + "rewards/symbolic_reward_partial_score/std": 0.13179610669612885, + "rewards/tag_count_reward/mean": -0.01953125, + "rewards/tag_count_reward/std": 0.1385180652141571, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0599864721298218, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 382.0, + "sampling/sampling_logp_difference/mean": 1.3513113260269165, + "step": 1797 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.30788634717464447, + "epoch": 4.731578947368421, + "grad_norm": 0.03443191573023796, + "learning_rate": 1e-06, + "loss": 0.0174, + "step": 1798 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.2986367344856262, + "epoch": 4.734210526315789, + "grad_norm": 0.0047141476534307, + "learning_rate": 1e-06, + "loss": 0.6458, + "step": 1799 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.140625, + "entropy": 0.29760782420635223, + "epoch": 4.7368421052631575, + "grad_norm": 0.0066524529829621315, + "learning_rate": 1e-06, + "loss": 0.0348, + "step": 1800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.07421875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3175.0, + "completions/mean_length": 2149.83984375, + "completions/mean_terminated_length": 1008.70458984375, + "completions/min_length": 265.0, + "completions/min_terminated_length": 265.0, + "entropy": 0.28144165873527527, + "epoch": 4.739473684210527, + "frac_reward_zero_std": 0.5625, + "grad_norm": 226.54739379882812, + "learning_rate": 1e-06, + "loss": 0.083, + "num_tokens": 588753086.0, + "reward": 0.779984712600708, + "reward_std": 0.13797515630722046, + "rewards/progression_diversity/mean": -0.019111623987555504, + "rewards/progression_diversity/std": 0.0777515172958374, + "rewards/symbolic_reward_accuracy/mean": 0.849609375, + "rewards/symbolic_reward_accuracy/std": 0.35780346393585205, + "rewards/symbolic_reward_partial_score/mean": 0.9261067509651184, + "rewards/symbolic_reward_partial_score/std": 0.22488640248775482, + "rewards/tag_count_reward/mean": -0.07421875, + "rewards/tag_count_reward/std": 0.2623828947544098, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0423637628555298, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 384.0, + "sampling/sampling_logp_difference/mean": 3.5376710891723633, + "step": 1801 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2867640554904938, + "epoch": 4.742105263157895, + "grad_norm": 0.025515422224998474, + "learning_rate": 1e-06, + "loss": 0.1148, + "step": 1802 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.109375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.27136462926864624, + "epoch": 4.744736842105263, + "grad_norm": 0.008495149202644825, + "learning_rate": 1e-06, + "loss": 0.1537, + "step": 1803 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.2758960574865341, + "epoch": 4.747368421052632, + "grad_norm": 0.33327996730804443, + "learning_rate": 1e-06, + "loss": 0.0768, + "step": 1804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.04296875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 9181.0, + "completions/mean_length": 1735.49609375, + "completions/mean_terminated_length": 1077.80810546875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.2854105681180954, + "epoch": 4.75, + "frac_reward_zero_std": 0.375, + "grad_norm": 10.029664039611816, + "learning_rate": 1e-06, + "loss": 0.0718, + "num_tokens": 590060988.0, + "reward": 0.8515157699584961, + "reward_std": 0.12311355024576187, + "rewards/progression_diversity/mean": -0.014438299462199211, + "rewards/progression_diversity/std": 0.07580766081809998, + "rewards/symbolic_reward_accuracy/mean": 0.943359375, + "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, + "rewards/symbolic_reward_partial_score/mean": 0.9671223759651184, + "rewards/symbolic_reward_partial_score/std": 0.15571942925453186, + "rewards/tag_count_reward/mean": -0.044921875, + "rewards/tag_count_reward/std": 0.20733514428138733, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.04230797290802, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 386.0, + "sampling/sampling_logp_difference/mean": 3.432555675506592, + "step": 1805 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1953125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2759237736463547, + "epoch": 4.752631578947368, + "grad_norm": 1959.9281005859375, + "learning_rate": 1e-06, + "loss": 0.2892, + "step": 1806 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.28651775419712067, + "epoch": 4.755263157894737, + "grad_norm": 352.3994140625, + "learning_rate": 1e-06, + "loss": 0.1454, + "step": 1807 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2970658540725708, + "epoch": 4.757894736842105, + "grad_norm": 0.0121544124558568, + "learning_rate": 1e-06, + "loss": 0.0625, + "step": 1808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3052.0, + "completions/mean_length": 1934.1640625, + "completions/mean_terminated_length": 938.6638793945312, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "entropy": 0.27560150623321533, + "epoch": 4.760526315789473, + "frac_reward_zero_std": 0.40625, + "grad_norm": 242.6947479248047, + "learning_rate": 1e-06, + "loss": 0.1073, + "num_tokens": 591441072.0, + "reward": 0.8055234551429749, + "reward_std": 0.10336482524871826, + "rewards/progression_diversity/mean": -0.01895313523709774, + "rewards/progression_diversity/std": 0.08070053160190582, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.93896484375, + "rewards/symbolic_reward_partial_score/std": 0.19326075911521912, + "rewards/tag_count_reward/mean": -0.056640625, + "rewards/tag_count_reward/std": 0.23138070106506348, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0425719022750854, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 388.0, + "sampling/sampling_logp_difference/mean": 3.5746545791625977, + "step": 1809 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.28119949996471405, + "epoch": 4.7631578947368425, + "grad_norm": 0.02790587767958641, + "learning_rate": 1e-06, + "loss": 0.1284, + "step": 1810 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2921789288520813, + "epoch": 4.765789473684211, + "grad_norm": 0.060401126742362976, + "learning_rate": 1e-06, + "loss": 0.0706, + "step": 1811 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3359375, + "entropy": 0.28399544954299927, + "epoch": 4.768421052631579, + "grad_norm": 0.10208897292613983, + "learning_rate": 1e-06, + "loss": 0.0961, + "step": 1812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3719.0, + "completions/mean_length": 1791.84375, + "completions/mean_terminated_length": 819.0333862304688, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "entropy": 0.2930610924959183, + "epoch": 4.771052631578947, + "frac_reward_zero_std": 0.5625, + "grad_norm": 260.9751892089844, + "learning_rate": 1e-06, + "loss": 0.0712, + "num_tokens": 592748288.0, + "reward": 0.7997059226036072, + "reward_std": 0.09009853005409241, + "rewards/progression_diversity/mean": -0.014760013669729233, + "rewards/progression_diversity/std": 0.06976296752691269, + "rewards/symbolic_reward_accuracy/mean": 0.876953125, + "rewards/symbolic_reward_accuracy/std": 0.32881227135658264, + "rewards/symbolic_reward_partial_score/mean": 0.9298502206802368, + "rewards/symbolic_reward_partial_score/std": 0.21666119992733002, + "rewards/tag_count_reward/mean": -0.052734375, + "rewards/tag_count_reward/std": 0.22372129559516907, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0533522367477417, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 388.0, + "sampling/sampling_logp_difference/mean": 2.341224431991577, + "step": 1813 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.21875, + "entropy": 0.29311226308345795, + "epoch": 4.773684210526316, + "grad_norm": 0.006627992261201143, + "learning_rate": 1e-06, + "loss": 0.0818, + "step": 1814 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.203125, + "entropy": 0.2873583137989044, + "epoch": 4.776315789473684, + "grad_norm": 0.0039825947023928165, + "learning_rate": 1e-06, + "loss": 0.0784, + "step": 1815 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.1015625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.29943425953388214, + "epoch": 4.778947368421052, + "grad_norm": 0.006021894048899412, + "learning_rate": 1e-06, + "loss": 0.0369, + "step": 1816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3120.0, + "completions/mean_length": 1341.46875, + "completions/mean_terminated_length": 949.5791625976562, + "completions/min_length": 207.0, + "completions/min_terminated_length": 207.0, + "entropy": 0.3028171956539154, + "epoch": 4.781578947368421, + "frac_reward_zero_std": 0.34375, + "grad_norm": 35.22563552856445, + "learning_rate": 1e-06, + "loss": 0.0162, + "num_tokens": 593841616.0, + "reward": 0.8557599782943726, + "reward_std": 0.1146036684513092, + "rewards/progression_diversity/mean": -0.009937912225723267, + "rewards/progression_diversity/std": 0.0624278225004673, + "rewards/symbolic_reward_accuracy/mean": 0.943359375, + "rewards/symbolic_reward_accuracy/std": 0.23138070106506348, + "rewards/symbolic_reward_partial_score/mean": 0.974609375, + "rewards/symbolic_reward_partial_score/std": 0.12061332911252975, + "rewards/tag_count_reward/mean": -0.025390625, + "rewards/tag_count_reward/std": 0.15746226906776428, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.052259922027588, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 390.0, + "sampling/sampling_logp_difference/mean": 2.46126127243042, + "step": 1817 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.2946004569530487, + "epoch": 4.784210526315789, + "grad_norm": 524.8899536132812, + "learning_rate": 1e-06, + "loss": 0.1229, + "step": 1818 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.0234375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.29525619745254517, + "epoch": 4.786842105263158, + "grad_norm": 0.0062756622210145, + "learning_rate": 1e-06, + "loss": 0.0398, + "step": 1819 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.265625, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2876027226448059, + "epoch": 4.7894736842105265, + "grad_norm": 0.011834767647087574, + "learning_rate": 1e-06, + "loss": 0.072, + "step": 1820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 8261.0, + "completions/mean_length": 1432.533203125, + "completions/mean_terminated_length": 950.227783203125, + "completions/min_length": 243.0, + "completions/min_terminated_length": 243.0, + "entropy": 0.294967457652092, + "epoch": 4.792105263157895, + "frac_reward_zero_std": 0.59375, + "grad_norm": 206.98422241210938, + "learning_rate": 1e-06, + "loss": 0.0693, + "num_tokens": 594979361.0, + "reward": 0.8551140427589417, + "reward_std": 0.08930052816867828, + "rewards/progression_diversity/mean": -0.011060354299843311, + "rewards/progression_diversity/std": 0.0651150494813919, + "rewards/symbolic_reward_accuracy/mean": 0.9453125, + "rewards/symbolic_reward_accuracy/std": 0.2275916188955307, + "rewards/symbolic_reward_partial_score/mean": 0.97119140625, + "rewards/symbolic_reward_partial_score/std": 0.14763346314430237, + "rewards/tag_count_reward/mean": -0.033203125, + "rewards/tag_count_reward/std": 0.17934183776378632, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0505045652389526, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 392.0, + "sampling/sampling_logp_difference/mean": 2.7975821495056152, + "step": 1821 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.29428502917289734, + "epoch": 4.794736842105263, + "grad_norm": 0.00786570180207491, + "learning_rate": 1e-06, + "loss": 0.0222, + "step": 1822 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.2888794392347336, + "epoch": 4.7973684210526315, + "grad_norm": 0.015527124516665936, + "learning_rate": 1e-06, + "loss": 0.0965, + "step": 1823 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.2929443120956421, + "epoch": 4.8, + "grad_norm": 0.0051057226955890656, + "learning_rate": 1e-06, + "loss": 0.0823, + "step": 1824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 5839.0, + "completions/mean_length": 1806.5625, + "completions/mean_terminated_length": 963.2396240234375, + "completions/min_length": 284.0, + "completions/min_terminated_length": 284.0, + "entropy": 0.2864207327365875, + "epoch": 4.802631578947368, + "frac_reward_zero_std": 0.59375, + "grad_norm": 191.23019409179688, + "learning_rate": 1e-06, + "loss": 0.0386, + "num_tokens": 596299105.0, + "reward": 0.7930800914764404, + "reward_std": 0.09042888879776001, + "rewards/progression_diversity/mean": -0.01328323408961296, + "rewards/progression_diversity/std": 0.06475852429866791, + "rewards/symbolic_reward_accuracy/mean": 0.865234375, + "rewards/symbolic_reward_accuracy/std": 0.3418070077896118, + "rewards/symbolic_reward_partial_score/mean": 0.9265950322151184, + "rewards/symbolic_reward_partial_score/std": 0.22019346058368683, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0515691041946411, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 392.0, + "sampling/sampling_logp_difference/mean": 2.668024778366089, + "step": 1825 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.26454444229602814, + "epoch": 4.8052631578947365, + "grad_norm": 0.004221981856971979, + "learning_rate": 1e-06, + "loss": 0.1982, + "step": 1826 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.2992611974477768, + "epoch": 4.807894736842105, + "grad_norm": 0.009702647104859352, + "learning_rate": 1e-06, + "loss": 0.0082, + "step": 1827 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.28555387258529663, + "epoch": 4.810526315789474, + "grad_norm": 0.007640550844371319, + "learning_rate": 1e-06, + "loss": 0.0761, + "step": 1828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 1518.830078125, + "completions/mean_terminated_length": 914.5548706054688, + "completions/min_length": 262.0, + "completions/min_terminated_length": 262.0, + "entropy": 0.2895033657550812, + "epoch": 4.813157894736842, + "frac_reward_zero_std": 0.4375, + "grad_norm": 482.5952453613281, + "learning_rate": 1e-06, + "loss": 0.0888, + "num_tokens": 597485546.0, + "reward": 0.8472241163253784, + "reward_std": 0.12179014086723328, + "rewards/progression_diversity/mean": -0.013924511149525642, + "rewards/progression_diversity/std": 0.07416489720344543, + "rewards/symbolic_reward_accuracy/mean": 0.93359375, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.9703775644302368, + "rewards/symbolic_reward_partial_score/std": 0.1422656625509262, + "rewards/tag_count_reward/mean": -0.0390625, + "rewards/tag_count_reward/std": 0.1939331740140915, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0476664304733276, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 392.0, + "sampling/sampling_logp_difference/mean": 3.4861273765563965, + "step": 1829 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.30734431743621826, + "epoch": 4.815789473684211, + "grad_norm": 0.01676630787551403, + "learning_rate": 1e-06, + "loss": 0.0315, + "step": 1830 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2916211038827896, + "epoch": 4.818421052631579, + "grad_norm": 0.01112330798059702, + "learning_rate": 1e-06, + "loss": 0.0759, + "step": 1831 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.2971077710390091, + "epoch": 4.821052631578947, + "grad_norm": 0.006531843915581703, + "learning_rate": 1e-06, + "loss": 0.0779, + "step": 1832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3569.0, + "completions/mean_length": 1641.30859375, + "completions/mean_terminated_length": 1010.7658081054688, + "completions/min_length": 273.0, + "completions/min_terminated_length": 273.0, + "entropy": 0.2911697328090668, + "epoch": 4.823684210526316, + "frac_reward_zero_std": 0.53125, + "grad_norm": 0.01622670330107212, + "learning_rate": 1e-06, + "loss": 0.0771, + "num_tokens": 598745384.0, + "reward": 0.8167889714241028, + "reward_std": 0.10516804456710815, + "rewards/progression_diversity/mean": -0.010561157017946243, + "rewards/progression_diversity/std": 0.05700741335749626, + "rewards/symbolic_reward_accuracy/mean": 0.8984375, + "rewards/symbolic_reward_accuracy/std": 0.30236753821372986, + "rewards/symbolic_reward_partial_score/mean": 0.9378255605697632, + "rewards/symbolic_reward_partial_score/std": 0.2107086479663849, + "rewards/tag_count_reward/mean": -0.03515625, + "rewards/tag_count_reward/std": 0.1843547374010086, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.045763373374939, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 394.0, + "sampling/sampling_logp_difference/mean": 3.4746012687683105, + "step": 1833 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.28375144302845, + "epoch": 4.826315789473684, + "grad_norm": 0.007259514648467302, + "learning_rate": 1e-06, + "loss": 0.0821, + "step": 1834 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1328125, + "clip_ratio/low_mean": 0.0390625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.171875, + "entropy": 0.28915947675704956, + "epoch": 4.828947368421053, + "grad_norm": 0.020610112696886063, + "learning_rate": 1e-06, + "loss": 0.0623, + "step": 1835 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.171875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2865881621837616, + "epoch": 4.831578947368421, + "grad_norm": 0.014592897146940231, + "learning_rate": 1e-06, + "loss": 0.1088, + "step": 1836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.041015625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3561.0, + "completions/mean_length": 1590.58984375, + "completions/mean_terminated_length": 957.8778686523438, + "completions/min_length": 260.0, + "completions/min_terminated_length": 260.0, + "entropy": 0.29244448244571686, + "epoch": 4.83421052631579, + "frac_reward_zero_std": 0.5, + "grad_norm": 432.8992919921875, + "learning_rate": 1e-06, + "loss": 0.042, + "num_tokens": 599961590.0, + "reward": 0.8015085458755493, + "reward_std": 0.11123409122228622, + "rewards/progression_diversity/mean": -0.010282262228429317, + "rewards/progression_diversity/std": 0.05530929192900658, + "rewards/symbolic_reward_accuracy/mean": 0.87109375, + "rewards/symbolic_reward_accuracy/std": 0.33542385697364807, + "rewards/symbolic_reward_partial_score/mean": 0.9435220956802368, + "rewards/symbolic_reward_partial_score/std": 0.17923220992088318, + "rewards/tag_count_reward/mean": -0.041015625, + "rewards/tag_count_reward/std": 0.19852031767368317, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0479365587234497, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 396.0, + "sampling/sampling_logp_difference/mean": 3.16892409324646, + "step": 1837 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1484375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.28493815660476685, + "epoch": 4.836842105263158, + "grad_norm": 0.014859353192150593, + "learning_rate": 1e-06, + "loss": 0.0579, + "step": 1838 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.29050807654857635, + "epoch": 4.839473684210526, + "grad_norm": 0.00957922451198101, + "learning_rate": 1e-06, + "loss": 0.0393, + "step": 1839 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.28927716612815857, + "epoch": 4.842105263157895, + "grad_norm": 0.018363086506724358, + "learning_rate": 1e-06, + "loss": 0.0646, + "step": 1840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4263.0, + "completions/mean_length": 961.29296875, + "completions/mean_terminated_length": 839.8543090820312, + "completions/min_length": 251.0, + "completions/min_terminated_length": 251.0, + "entropy": 0.30787861347198486, + "epoch": 4.844736842105263, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.09452533721923828, + "learning_rate": 1e-06, + "loss": 0.0309, + "num_tokens": 600826380.0, + "reward": 0.8753113746643066, + "reward_std": 0.08123890310525894, + "rewards/progression_diversity/mean": -0.0030442136339843273, + "rewards/progression_diversity/std": 0.038930658251047134, + "rewards/symbolic_reward_accuracy/mean": 0.96875, + "rewards/symbolic_reward_accuracy/std": 0.17416280508041382, + "rewards/symbolic_reward_partial_score/mean": 0.98291015625, + "rewards/symbolic_reward_partial_score/std": 0.1046241894364357, + "rewards/tag_count_reward/mean": -0.0078125, + "rewards/tag_count_reward/std": 0.08812850713729858, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.063698410987854, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 394.0, + "sampling/sampling_logp_difference/mean": 1.2699187994003296, + "step": 1841 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1484375, + "entropy": 0.31785525381565094, + "epoch": 4.847368421052631, + "grad_norm": 36.97111511230469, + "learning_rate": 1e-06, + "loss": 0.0007, + "step": 1842 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.15625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.3095642328262329, + "epoch": 4.85, + "grad_norm": 0.019375812262296677, + "learning_rate": 1e-06, + "loss": 0.0321, + "step": 1843 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1796875, + "entropy": 0.3243100643157959, + "epoch": 4.852631578947369, + "grad_norm": 0.00351691129617393, + "learning_rate": 1e-06, + "loss": -0.0018, + "step": 1844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.060546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4850.0, + "completions/mean_length": 1875.322265625, + "completions/mean_terminated_length": 940.2515869140625, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.2960711419582367, + "epoch": 4.855263157894737, + "frac_reward_zero_std": 0.5, + "grad_norm": 743.4161376953125, + "learning_rate": 1e-06, + "loss": 0.0685, + "num_tokens": 602193201.0, + "reward": 0.8039959669113159, + "reward_std": 0.11662513762712479, + "rewards/progression_diversity/mean": -0.015446132980287075, + "rewards/progression_diversity/std": 0.07263787090778351, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.93310546875, + "rewards/symbolic_reward_partial_score/std": 0.21881107985973358, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0418115854263306, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 396.0, + "sampling/sampling_logp_difference/mean": 4.129971981048584, + "step": 1845 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.125, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.28108513355255127, + "epoch": 4.8578947368421055, + "grad_norm": 685.2288818359375, + "learning_rate": 1e-06, + "loss": 0.2488, + "step": 1846 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1953125, + "entropy": 0.28235989809036255, + "epoch": 4.860526315789474, + "grad_norm": 0.029184581711888313, + "learning_rate": 1e-06, + "loss": 0.1114, + "step": 1847 + }, + { + "clip_ratio/high_max": 0.5, + "clip_ratio/high_mean": 0.0703125, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.1875, + "entropy": 0.28090812265872955, + "epoch": 4.863157894736842, + "grad_norm": 0.2511216998100281, + "learning_rate": 1e-06, + "loss": 0.0608, + "step": 1848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.095703125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3051.0, + "completions/mean_length": 2357.298828125, + "completions/mean_terminated_length": 872.8314819335938, + "completions/min_length": 270.0, + "completions/min_terminated_length": 270.0, + "entropy": 0.26737289130687714, + "epoch": 4.86578947368421, + "frac_reward_zero_std": 0.3125, + "grad_norm": 318.45111083984375, + "learning_rate": 1e-06, + "loss": 0.1068, + "num_tokens": 603795082.0, + "reward": 0.7866251468658447, + "reward_std": 0.13819053769111633, + "rewards/progression_diversity/mean": -0.028891239315271378, + "rewards/progression_diversity/std": 0.10002173483371735, + "rewards/symbolic_reward_accuracy/mean": 0.869140625, + "rewards/symbolic_reward_accuracy/std": 0.33757632970809937, + "rewards/symbolic_reward_partial_score/mean": 0.91015625, + "rewards/symbolic_reward_partial_score/std": 0.25601255893707275, + "rewards/tag_count_reward/mean": -0.076171875, + "rewards/tag_count_reward/std": 0.26553234457969666, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0239213705062866, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 396.0, + "sampling/sampling_logp_difference/mean": 6.760751247406006, + "step": 1849 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2813594937324524, + "epoch": 4.868421052631579, + "grad_norm": 371.5801086425781, + "learning_rate": 1e-06, + "loss": 0.1545, + "step": 1850 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3671875, + "entropy": 0.25378528982400894, + "epoch": 4.871052631578947, + "grad_norm": 1.993329405784607, + "learning_rate": 1e-06, + "loss": 0.2296, + "step": 1851 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2722862958908081, + "epoch": 4.873684210526315, + "grad_norm": 0.014913872815668583, + "learning_rate": 1e-06, + "loss": 0.1323, + "step": 1852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.119140625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3017.0, + "completions/mean_length": 2810.728515625, + "completions/mean_terminated_length": 974.8757934570312, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "entropy": 0.2696688771247864, + "epoch": 4.876315789473685, + "frac_reward_zero_std": 0.34375, + "grad_norm": 104.42438507080078, + "learning_rate": 1e-06, + "loss": 0.0423, + "num_tokens": 605653183.0, + "reward": 0.7709691524505615, + "reward_std": 0.15861886739730835, + "rewards/progression_diversity/mean": -0.03199107199907303, + "rewards/progression_diversity/std": 0.09810096770524979, + "rewards/symbolic_reward_accuracy/mean": 0.85546875, + "rewards/symbolic_reward_accuracy/std": 0.35197147727012634, + "rewards/symbolic_reward_partial_score/mean": 0.8893228769302368, + "rewards/symbolic_reward_partial_score/std": 0.2945672869682312, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0273919105529785, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 400.0, + "sampling/sampling_logp_difference/mean": 5.841981887817383, + "step": 1853 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.25676195323467255, + "epoch": 4.878947368421053, + "grad_norm": 27.663148880004883, + "learning_rate": 1e-06, + "loss": 0.1787, + "step": 1854 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.1796875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.390625, + "entropy": 0.23987554013729095, + "epoch": 4.881578947368421, + "grad_norm": 0.01363349985331297, + "learning_rate": 1e-06, + "loss": 0.2546, + "step": 1855 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.1171875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.28125, + "entropy": 0.26036641001701355, + "epoch": 4.88421052631579, + "grad_norm": 0.026609908789396286, + "learning_rate": 1e-06, + "loss": 0.0951, + "step": 1856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.091796875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3775.0, + "completions/mean_length": 2387.767578125, + "completions/mean_terminated_length": 973.0946655273438, + "completions/min_length": 302.0, + "completions/min_terminated_length": 302.0, + "entropy": 0.2557432949542999, + "epoch": 4.886842105263158, + "frac_reward_zero_std": 0.34375, + "grad_norm": 442.3005065917969, + "learning_rate": 1e-06, + "loss": 0.1959, + "num_tokens": 607298216.0, + "reward": 0.7776139974594116, + "reward_std": 0.15359753370285034, + "rewards/progression_diversity/mean": -0.03156745806336403, + "rewards/progression_diversity/std": 0.10606934875249863, + "rewards/symbolic_reward_accuracy/mean": 0.859375, + "rewards/symbolic_reward_accuracy/std": 0.3479743003845215, + "rewards/symbolic_reward_partial_score/mean": 0.8958333730697632, + "rewards/symbolic_reward_partial_score/std": 0.2811386287212372, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0301284790039062, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 400.0, + "sampling/sampling_logp_difference/mean": 5.681142807006836, + "step": 1857 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.2727195918560028, + "epoch": 4.889473684210526, + "grad_norm": 266.2679443359375, + "learning_rate": 1e-06, + "loss": 0.1627, + "step": 1858 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.21875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.2670164704322815, + "epoch": 4.8921052631578945, + "grad_norm": 0.010746384039521217, + "learning_rate": 1e-06, + "loss": 0.1468, + "step": 1859 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.234375, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.27139565348625183, + "epoch": 4.894736842105263, + "grad_norm": 0.0044360412284731865, + "learning_rate": 1e-06, + "loss": 0.1136, + "step": 1860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0390625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3370.0, + "completions/mean_length": 1420.21484375, + "completions/mean_terminated_length": 811.9308471679688, + "completions/min_length": 279.0, + "completions/min_terminated_length": 279.0, + "entropy": 0.2882204055786133, + "epoch": 4.897368421052631, + "frac_reward_zero_std": 0.5, + "grad_norm": 773.8041381835938, + "learning_rate": 1e-06, + "loss": 0.1389, + "num_tokens": 608393302.0, + "reward": 0.8496336936950684, + "reward_std": 0.10768449306488037, + "rewards/progression_diversity/mean": -0.017098724842071533, + "rewards/progression_diversity/std": 0.08937934786081314, + "rewards/symbolic_reward_accuracy/mean": 0.939453125, + "rewards/symbolic_reward_accuracy/std": 0.2387305200099945, + "rewards/symbolic_reward_partial_score/mean": 0.9635416269302368, + "rewards/symbolic_reward_partial_score/std": 0.16535688936710358, + "rewards/tag_count_reward/mean": -0.029296875, + "rewards/tag_count_reward/std": 0.16880230605602264, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0443835258483887, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 402.0, + "sampling/sampling_logp_difference/mean": 3.879685878753662, + "step": 1861 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.30971573293209076, + "epoch": 4.9, + "grad_norm": 0.017872009426355362, + "learning_rate": 1e-06, + "loss": 0.0584, + "step": 1862 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2265625, + "entropy": 0.30385591089725494, + "epoch": 4.902631578947369, + "grad_norm": 0.0137720238417387, + "learning_rate": 1e-06, + "loss": 0.0668, + "step": 1863 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2265625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.28459571301937103, + "epoch": 4.905263157894737, + "grad_norm": 0.005599349737167358, + "learning_rate": 1e-06, + "loss": 0.138, + "step": 1864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3664.0, + "completions/mean_length": 2037.166015625, + "completions/mean_terminated_length": 854.2346801757812, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "entropy": 0.29497842490673065, + "epoch": 4.907894736842105, + "frac_reward_zero_std": 0.46875, + "grad_norm": 3.637153148651123, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 609837131.0, + "reward": 0.8191713094711304, + "reward_std": 0.12532027065753937, + "rewards/progression_diversity/mean": -0.03111710585653782, + "rewards/progression_diversity/std": 0.11476296186447144, + "rewards/symbolic_reward_accuracy/mean": 0.91015625, + "rewards/symbolic_reward_accuracy/std": 0.2862374484539032, + "rewards/symbolic_reward_partial_score/mean": 0.9334309697151184, + "rewards/symbolic_reward_partial_score/std": 0.229754239320755, + "rewards/tag_count_reward/mean": -0.06640625, + "rewards/tag_count_reward/std": 0.2492343932390213, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0394989252090454, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 404.0, + "sampling/sampling_logp_difference/mean": 4.205488204956055, + "step": 1865 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.28409726917743683, + "epoch": 4.910526315789474, + "grad_norm": 175.42616271972656, + "learning_rate": 1e-06, + "loss": 0.1474, + "step": 1866 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.1015625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2551625221967697, + "epoch": 4.913157894736842, + "grad_norm": 11.765541076660156, + "learning_rate": 1e-06, + "loss": 0.2086, + "step": 1867 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2779727131128311, + "epoch": 4.91578947368421, + "grad_norm": 0.00957444030791521, + "learning_rate": 1e-06, + "loss": 0.1011, + "step": 1868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.099609375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3578.0, + "completions/mean_length": 2560.607421875, + "completions/mean_terminated_length": 1031.33837890625, + "completions/min_length": 282.0, + "completions/min_terminated_length": 282.0, + "entropy": 0.26823192834854126, + "epoch": 4.918421052631579, + "frac_reward_zero_std": 0.28125, + "grad_norm": 1057.808837890625, + "learning_rate": 1e-06, + "loss": 0.1326, + "num_tokens": 611581346.0, + "reward": 0.7715161442756653, + "reward_std": 0.2120749056339264, + "rewards/progression_diversity/mean": -0.04077008739113808, + "rewards/progression_diversity/std": 0.13221383094787598, + "rewards/symbolic_reward_accuracy/mean": 0.8515625, + "rewards/symbolic_reward_accuracy/std": 0.35588082671165466, + "rewards/symbolic_reward_partial_score/mean": 0.8992512822151184, + "rewards/symbolic_reward_partial_score/std": 0.27642568945884705, + "rewards/tag_count_reward/mean": -0.087890625, + "rewards/tag_count_reward/std": 0.2834126651287079, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0262513160705566, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 404.0, + "sampling/sampling_logp_difference/mean": 5.402329444885254, + "step": 1869 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3515625, + "entropy": 0.27585209906101227, + "epoch": 4.921052631578947, + "grad_norm": 338.9614562988281, + "learning_rate": 1e-06, + "loss": 0.1617, + "step": 1870 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2421875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.26490674912929535, + "epoch": 4.923684210526316, + "grad_norm": 0.07172678411006927, + "learning_rate": 1e-06, + "loss": 0.1314, + "step": 1871 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2890625, + "clip_ratio/low_mean": 0.1640625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.453125, + "entropy": 0.2522326856851578, + "epoch": 4.926315789473684, + "grad_norm": 0.006790068931877613, + "learning_rate": 1e-06, + "loss": 0.2023, + "step": 1872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3835.0, + "completions/mean_length": 1870.408203125, + "completions/mean_terminated_length": 870.5156860351562, + "completions/min_length": 283.0, + "completions/min_terminated_length": 283.0, + "entropy": 0.29680676758289337, + "epoch": 4.928947368421053, + "frac_reward_zero_std": 0.4375, + "grad_norm": 459.56561279296875, + "learning_rate": 1e-06, + "loss": 0.0501, + "num_tokens": 612917779.0, + "reward": 0.8070130944252014, + "reward_std": 0.12357471138238907, + "rewards/progression_diversity/mean": -0.02623012289404869, + "rewards/progression_diversity/std": 0.10527035593986511, + "rewards/symbolic_reward_accuracy/mean": 0.892578125, + "rewards/symbolic_reward_accuracy/std": 0.30995169281959534, + "rewards/symbolic_reward_partial_score/mean": 0.9239909052848816, + "rewards/symbolic_reward_partial_score/std": 0.24311135709285736, + "rewards/tag_count_reward/mean": -0.0546875, + "rewards/tag_count_reward/std": 0.2275916188955307, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0434627532958984, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 408.0, + "sampling/sampling_logp_difference/mean": 3.528529167175293, + "step": 1873 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.296875, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.28874650597572327, + "epoch": 4.931578947368421, + "grad_norm": 229.14480590820312, + "learning_rate": 1e-06, + "loss": 0.1112, + "step": 1874 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.265625, + "entropy": 0.28308890759944916, + "epoch": 4.934210526315789, + "grad_norm": 0.023203283548355103, + "learning_rate": 1e-06, + "loss": 0.1112, + "step": 1875 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.2699621021747589, + "epoch": 4.936842105263158, + "grad_norm": 0.0074362908490002155, + "learning_rate": 1e-06, + "loss": 0.1392, + "step": 1876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.068359375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3640.0, + "completions/mean_length": 2017.177734375, + "completions/mean_terminated_length": 963.0083618164062, + "completions/min_length": 336.0, + "completions/min_terminated_length": 336.0, + "entropy": 0.2770151048898697, + "epoch": 4.939473684210526, + "frac_reward_zero_std": 0.4375, + "grad_norm": 661.4608154296875, + "learning_rate": 1e-06, + "loss": 0.1113, + "num_tokens": 614368398.0, + "reward": 0.8179945945739746, + "reward_std": 0.1273220181465149, + "rewards/progression_diversity/mean": -0.03159845247864723, + "rewards/progression_diversity/std": 0.12263655662536621, + "rewards/symbolic_reward_accuracy/mean": 0.90234375, + "rewards/symbolic_reward_accuracy/std": 0.29713961482048035, + "rewards/symbolic_reward_partial_score/mean": 0.9444986581802368, + "rewards/symbolic_reward_partial_score/std": 0.2035871148109436, + "rewards/tag_count_reward/mean": -0.064453125, + "rewards/tag_count_reward/std": 0.24579854309558868, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0411176681518555, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 408.0, + "sampling/sampling_logp_difference/mean": 3.9134116172790527, + "step": 1877 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.2913369834423065, + "epoch": 4.942105263157895, + "grad_norm": 0.01320668775588274, + "learning_rate": 1e-06, + "loss": 0.0404, + "step": 1878 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.140625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.28409533202648163, + "epoch": 4.9447368421052635, + "grad_norm": 0.03402348980307579, + "learning_rate": 1e-06, + "loss": 0.1155, + "step": 1879 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.203125, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2711239904165268, + "epoch": 4.947368421052632, + "grad_norm": 0.008342397399246693, + "learning_rate": 1e-06, + "loss": 0.1331, + "step": 1880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.056640625, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3740.0, + "completions/mean_length": 1834.041015625, + "completions/mean_terminated_length": 960.4410400390625, + "completions/min_length": 289.0, + "completions/min_terminated_length": 289.0, + "entropy": 0.2792969048023224, + "epoch": 4.95, + "frac_reward_zero_std": 0.3125, + "grad_norm": 335.7244873046875, + "learning_rate": 1e-06, + "loss": 0.125, + "num_tokens": 615720259.0, + "reward": 0.842618465423584, + "reward_std": 0.1381361484527588, + "rewards/progression_diversity/mean": -0.02526368759572506, + "rewards/progression_diversity/std": 0.10902338474988937, + "rewards/symbolic_reward_accuracy/mean": 0.93359375, + "rewards/symbolic_reward_accuracy/std": 0.2492343932390213, + "rewards/symbolic_reward_partial_score/mean": 0.9593098759651184, + "rewards/symbolic_reward_partial_score/std": 0.18112175166606903, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0347847938537598, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 410.0, + "sampling/sampling_logp_difference/mean": 4.509306907653809, + "step": 1881 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.25, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.34375, + "entropy": 0.2655886858701706, + "epoch": 4.9526315789473685, + "grad_norm": 0.01917690970003605, + "learning_rate": 1e-06, + "loss": 0.1565, + "step": 1882 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2734375, + "clip_ratio/low_mean": 0.0546875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.328125, + "entropy": 0.2937444746494293, + "epoch": 4.955263157894737, + "grad_norm": 0.027145760133862495, + "learning_rate": 1e-06, + "loss": 0.0847, + "step": 1883 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.328125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.359375, + "entropy": 0.2872210890054703, + "epoch": 4.957894736842105, + "grad_norm": 0.007199451327323914, + "learning_rate": 1e-06, + "loss": 0.1322, + "step": 1884 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.076171875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4932.0, + "completions/mean_length": 2128.708984375, + "completions/mean_terminated_length": 953.3255615234375, + "completions/min_length": 268.0, + "completions/min_terminated_length": 268.0, + "entropy": 0.2776035964488983, + "epoch": 4.9605263157894735, + "frac_reward_zero_std": 0.46875, + "grad_norm": 354.8702392578125, + "learning_rate": 1e-06, + "loss": 0.0456, + "num_tokens": 617215790.0, + "reward": 0.7808091640472412, + "reward_std": 0.14809326827526093, + "rewards/progression_diversity/mean": -0.029436133801937103, + "rewards/progression_diversity/std": 0.1100505068898201, + "rewards/symbolic_reward_accuracy/mean": 0.861328125, + "rewards/symbolic_reward_accuracy/std": 0.34594178199768066, + "rewards/symbolic_reward_partial_score/mean": 0.9044596552848816, + "rewards/symbolic_reward_partial_score/std": 0.26103639602661133, + "rewards/tag_count_reward/mean": -0.0703125, + "rewards/tag_count_reward/std": 0.25592297315597534, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.038970708847046, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 410.0, + "sampling/sampling_logp_difference/mean": 4.3476243019104, + "step": 1885 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.234375, + "entropy": 0.2802853584289551, + "epoch": 4.963157894736842, + "grad_norm": 0.026210030540823936, + "learning_rate": 1e-06, + "loss": 0.1114, + "step": 1886 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0625, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.292650431394577, + "epoch": 4.965789473684211, + "grad_norm": 0.017857909202575684, + "learning_rate": 1e-06, + "loss": 0.0637, + "step": 1887 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1796875, + "clip_ratio/low_mean": 0.1328125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3125, + "entropy": 0.258012056350708, + "epoch": 4.968421052631579, + "grad_norm": 0.031659141182899475, + "learning_rate": 1e-06, + "loss": 0.1985, + "step": 1888 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.052734375, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 4051.0, + "completions/mean_length": 1877.359375, + "completions/mean_terminated_length": 1069.773193359375, + "completions/min_length": 267.0, + "completions/min_terminated_length": 267.0, + "entropy": 0.28717607259750366, + "epoch": 4.971052631578948, + "frac_reward_zero_std": 0.34375, + "grad_norm": 143.55223083496094, + "learning_rate": 1e-06, + "loss": 0.0625, + "num_tokens": 618611846.0, + "reward": 0.8242976665496826, + "reward_std": 0.13553351163864136, + "rewards/progression_diversity/mean": -0.021410608664155006, + "rewards/progression_diversity/std": 0.09337057173252106, + "rewards/symbolic_reward_accuracy/mean": 0.90625, + "rewards/symbolic_reward_accuracy/std": 0.29176566004753113, + "rewards/symbolic_reward_partial_score/mean": 0.9527994990348816, + "rewards/symbolic_reward_partial_score/std": 0.17747651040554047, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0396955013275146, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 412.0, + "sampling/sampling_logp_difference/mean": 4.124094009399414, + "step": 1889 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.078125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2421875, + "entropy": 0.2870943546295166, + "epoch": 4.973684210526316, + "grad_norm": 0.014890086837112904, + "learning_rate": 1e-06, + "loss": 0.0991, + "step": 1890 + }, + { + "clip_ratio/high_max": 0.75, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0703125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2578125, + "entropy": 0.27629394829273224, + "epoch": 4.976315789473684, + "grad_norm": 0.010494343005120754, + "learning_rate": 1e-06, + "loss": 0.0717, + "step": 1891 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1875, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2734375, + "entropy": 0.26271922141313553, + "epoch": 4.978947368421053, + "grad_norm": 0.014826311729848385, + "learning_rate": 1e-06, + "loss": 0.1739, + "step": 1892 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.064453125, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3538.0, + "completions/mean_length": 1878.482421875, + "completions/mean_terminated_length": 879.1461791992188, + "completions/min_length": 249.0, + "completions/min_terminated_length": 249.0, + "entropy": 0.30142028629779816, + "epoch": 4.981578947368421, + "frac_reward_zero_std": 0.4375, + "grad_norm": 139.75131225585938, + "learning_rate": 1e-06, + "loss": 0.0789, + "num_tokens": 619963101.0, + "reward": 0.8057065010070801, + "reward_std": 0.13081881403923035, + "rewards/progression_diversity/mean": -0.025057487189769745, + "rewards/progression_diversity/std": 0.10004720836877823, + "rewards/symbolic_reward_accuracy/mean": 0.88671875, + "rewards/symbolic_reward_accuracy/std": 0.3172462284564972, + "rewards/symbolic_reward_partial_score/mean": 0.9339193105697632, + "rewards/symbolic_reward_partial_score/std": 0.21596600115299225, + "rewards/tag_count_reward/mean": -0.0625, + "rewards/tag_count_reward/std": 0.2422981858253479, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0438662767410278, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 412.0, + "sampling/sampling_logp_difference/mean": 4.115793228149414, + "step": 1893 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.28368599712848663, + "epoch": 4.984210526315789, + "grad_norm": 0.008541757240891457, + "learning_rate": 1e-06, + "loss": 0.1416, + "step": 1894 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.03125, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2890625, + "entropy": 0.2943654954433441, + "epoch": 4.9868421052631575, + "grad_norm": 0.007597525604069233, + "learning_rate": 1e-06, + "loss": 0.0159, + "step": 1895 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2109375, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.296875, + "entropy": 0.28492121398448944, + "epoch": 4.989473684210527, + "grad_norm": 0.008518518880009651, + "learning_rate": 1e-06, + "loss": 0.0887, + "step": 1896 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0546875, + "completions/max_length": 16384.0, + "completions/max_terminated_length": 3774.0, + "completions/mean_length": 1736.919921875, + "completions/mean_terminated_length": 889.568115234375, + "completions/min_length": 236.0, + "completions/min_terminated_length": 236.0, + "entropy": 0.30898527801036835, + "epoch": 4.992105263157895, + "frac_reward_zero_std": 0.53125, + "grad_norm": 330.7555847167969, + "learning_rate": 1e-06, + "loss": 0.0491, + "num_tokens": 621237364.0, + "reward": 0.8030959963798523, + "reward_std": 0.13730832934379578, + "rewards/progression_diversity/mean": -0.01755291409790516, + "rewards/progression_diversity/std": 0.07998932898044586, + "rewards/symbolic_reward_accuracy/mean": 0.8828125, + "rewards/symbolic_reward_accuracy/std": 0.32195815443992615, + "rewards/symbolic_reward_partial_score/mean": 0.9288737177848816, + "rewards/symbolic_reward_partial_score/std": 0.22742776572704315, + "rewards/tag_count_reward/mean": -0.05078125, + "rewards/tag_count_reward/std": 0.21976542472839355, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 1.0468072891235352, + "sampling/importance_sampling_ratio/min": 0.0, + "sampling/sampling_logp_difference/max": 412.0, + "sampling/sampling_logp_difference/mean": 3.8619487285614014, + "step": 1897 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1171875, + "clip_ratio/low_mean": 0.09375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.2109375, + "entropy": 0.29312750697135925, + "epoch": 4.994736842105263, + "grad_norm": 0.026777606457471848, + "learning_rate": 1e-06, + "loss": 0.1218, + "step": 1898 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.2578125, + "clip_ratio/low_mean": 0.046875, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.3046875, + "entropy": 0.30650024116039276, + "epoch": 4.997368421052632, + "grad_norm": 0.006984531879425049, + "learning_rate": 1e-06, + "loss": 0.0475, + "step": 1899 + }, + { + "clip_ratio/high_max": 1.0, + "clip_ratio/high_mean": 0.1640625, + "clip_ratio/low_mean": 0.0859375, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.25, + "entropy": 0.295014426112175, + "epoch": 5.0, + "grad_norm": 0.007008237764239311, + "learning_rate": 1e-06, + "loss": 0.1042, + "step": 1900 + }, + { + "epoch": 5.0, + "eval_clip_ratio/high_max": 0.0, + "eval_clip_ratio/high_mean": 0.0, + "eval_clip_ratio/low_mean": 0.0, + "eval_clip_ratio/low_min": 0.0, + "eval_clip_ratio/region_mean": 0.0, + "eval_completions/clipped_ratio": 0.015380859375, + "eval_completions/max_length": 13712.375, + "eval_completions/max_terminated_length": 2437.9375, + "eval_completions/mean_length": 905.335693359375, + "eval_completions/mean_terminated_length": 664.4008159637451, + "eval_completions/min_length": 251.21875, + "eval_completions/min_terminated_length": 251.21875, + "eval_entropy": 0.31546804029494524, + "eval_frac_reward_zero_std": 0.59765625, + "eval_loss": 0.018208162859082222, + "eval_num_tokens": 621237364.0, + "eval_reward": 0.8710558321326971, + "eval_reward_std": 0.08327648929116549, + "eval_rewards/progression_diversity/mean": -0.00623641712081735, + "eval_rewards/progression_diversity/std": 0.04451105743646622, + "eval_rewards/symbolic_reward_accuracy/mean": 0.964111328125, + "eval_rewards/symbolic_reward_accuracy/std": 0.1680670971982181, + "eval_rewards/symbolic_reward_partial_score/mean": 0.9798990897834301, + "eval_rewards/symbolic_reward_partial_score/std": 0.10754719079704955, + "eval_rewards/tag_count_reward/mean": -0.01318359375, + "eval_rewards/tag_count_reward/std": 0.09598715417087078, + "eval_runtime": 3296.8572, + "eval_samples_per_second": 0.076, + "eval_sampling/importance_sampling_ratio/max": 2.0, + "eval_sampling/importance_sampling_ratio/mean": 1.0692463777959347, + "eval_sampling/importance_sampling_ratio/min": 7.846449850264574e-05, + "eval_sampling/sampling_logp_difference/max": 337.0917568653822, + "eval_sampling/sampling_logp_difference/mean": 0.9625979592092335, + "eval_steps_per_second": 0.001, + "step": 1900 + }, + { + "epoch": 5.0, + "step": 1900, + "total_flos": 0.0, + "train_loss": 0.10312542560923918, + "train_runtime": 114002.2359, + "train_samples_per_second": 0.134, + "train_steps_per_second": 0.017 + } + ], + "logging_steps": 1, + "max_steps": 1900, + "num_input_tokens_seen": 621237364, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}